In [0]:
#https://sites.temple.edu/tudsc/2017/03/30/measuring-similarity-between-texts-in-python/

In [0]:
d1 = "plot: two teen couples go to a church party, drink and then drive."
d2 = "films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before . "
d3 = "every now and then a movie comes along from a suspect studio , with every indication that it will be a stinker , and to everybody's surprise ( perhaps even the studio ) the film becomes a critical darling . "
d4 = "damn that y2k bug . "
d5 = "plot: two teen couples go to a church party, drink and then drive."

In [0]:
documents = [d1, d2, d3, d4, d5]

In [0]:
type(documents)

list

In [0]:
documents

['plot: two teen couples go to a church party, drink and then drive.',
 "films adapted from comic books have had plenty of success , whether they're about superheroes ( batman , superman , spawn ) , or geared toward kids ( casper ) or the arthouse crowd ( ghost world ) , but there's never really been a comic book like from hell before . ",
 "every now and then a movie comes along from a suspect studio , with every indication that it will be a stinker , and to everybody's surprise ( perhaps even the studio ) the film becomes a critical darling . ",
 'damn that y2k bug . ']

# Normalize

## Stemming

In [0]:
import nltk, string, numpy

nltk.download('punkt') # first-time use only

stemmer = nltk.stem.porter.PorterStemmer()

[nltk_data] Downloading package punkt to /content/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [0]:
def StemTokens(tokens):
  return [stemmer.stem(token) for token in tokens]

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

def StemNormalize(text):
  return StemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

## Lemmatization

In [0]:
nltk.download('wordnet') # first-time use only

lemmer = nltk.stem.WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /content/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [0]:
def LemTokens(tokens):
  return [lemmer.lemmatize(token) for token in tokens]

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

def LemNormalize(text):
  return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

## Text to vectors of term frequency

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

LemVectorizer = CountVectorizer(tokenizer=LemNormalize, stop_words='english')
LemVectorizer.fit_transform(documents)

<4x41 sparse matrix of type '<class 'numpy.int64'>'
	with 42 stored elements in Compressed Sparse Row format>

In [0]:
LemVectorizer.vocabulary_

{'adapted': 0,
 'arthouse': 1,
 'batman': 2,
 'book': 3,
 'bug': 4,
 'casper': 5,
 'church': 6,
 'come': 7,
 'comic': 8,
 'couple': 9,
 'critical': 10,
 'crowd': 11,
 'damn': 12,
 'darling': 13,
 'drink': 14,
 'drive': 15,
 'everybodys': 16,
 'film': 17,
 'geared': 18,
 'ghost': 19,
 'hell': 20,
 'indication': 21,
 'kid': 22,
 'like': 23,
 'movie': 24,
 'party': 25,
 'plenty': 26,
 'plot': 27,
 'really': 28,
 'spawn': 29,
 'stinker': 30,
 'studio': 31,
 'success': 32,
 'superheroes': 33,
 'superman': 34,
 'surprise': 35,
 'suspect': 36,
 'teen': 37,
 'theyre': 38,
 'world': 39,
 'y2k': 40}

In [0]:
tf_matrix = LemVectorizer.transform(documents).toarray()
tf_matrix

array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [1, 1, 1, 2, 0, 1, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
        1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1,
        0, 0, 1, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

## Get IDF

In [0]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidfTran = TfidfTransformer(norm = "l2")
tfidfTran.fit(tf_matrix)

tfidfTran.idf_

array([1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073,
       1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073,
       1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073,
       1.91629073, 1.91629073, 1.51082562, 1.91629073, 1.91629073,
       1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073,
       1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073,
       1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073,
       1.91629073, 1.91629073, 1.91629073, 1.91629073, 1.91629073,
       1.91629073])

In [0]:
import math

def idf(n,df):
    result = math.log((n+1.0)/(df+1.0)) + 1
    return result

print("The idf for terms that appear in one document: " + str(idf(4,1)))
print("The idf for terms that appear in two documents: " + str(idf(4,2)))

The idf for terms that appear in one document: 1.916290731874155
The idf for terms that appear in two documents: 1.5108256237659907


In [0]:
tfidf_matrix = tfidfTran.transform(tf_matrix)
tfidf_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.37796447, 0.        , 0.        , 0.37796447,
        0.        , 0.        , 0.        , 0.        , 0.37796447,
        0.37796447, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.37796447, 0.        , 0.37796447, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.37796447, 0.        , 0.        ,
        0.        ],
       [0.19381304, 0.19381304, 0.19381304, 0.38762607, 0.        ,
        0.19381304, 0.        , 0.        , 0.38762607, 0.        ,
        0.        , 0.19381304, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.15280442, 0.19381304, 0.19381304,
        0.19381304, 0.        , 0.19381304, 0.19381304, 0.        ,
        0.        , 0.19381304, 0.        , 0.19381304, 0.19381304,
        0.        , 0.     

# SKLearn implementation

Scikit-learn actually has another function TfidfVectorizer that combines the work of CountVectorizer and TfidfTransformer, which makes the process more efficient.

In [0]:
from google.colab import files
uploaded = files.upload()

Saving RTM_SDIVM_27_Master_Data_Change_Management v16_R5.xlsx to RTM_SDIVM_27_Master_Data_Change_Management v16_R5.xlsx


In [0]:
import pandas as pd
data = pd.read_excel('RTM_SDIVM_27_Master_Data_Change_Management v16_R5.xlsx', sheet_name = 'L1 Requirements')
data = data[data.Status != 'Cancelled']
data['text'] = data['Description \n(Stakeholder: I want to <goal or need>)'].astype(str) + data['So that <Justification> ']
req = data[['Requirement ID', 'text']]
req = req.dropna(how = 'all')

req.shape

(107, 2)

In [0]:
req.isnull().any()

Requirement ID    False
text              False
dtype: bool

In [0]:
reqs = req['text'].values

In [0]:
import pandas as pd

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

TfidfVec = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english')

def cos_similarity(textlist):
    tfidf = TfidfVec.fit_transform(textlist)
    return (tfidf * tfidf.T).toarray()

similarity = pd.DataFrame(cos_similarity(reqs))

In [0]:
similarity.isnull().any()

0                 False
1                 False
2                 False
3                 False
4                 False
5                 False
6                 False
7                 False
8                 False
9                 False
10                False
11                False
12                False
13                False
14                False
15                False
16                False
17                False
18                False
19                False
20                False
21                False
22                False
23                False
24                False
25                False
26                False
27                False
28                False
29                False
                  ...  
78                False
79                False
80                False
81                False
82                False
83                False
84                False
85                False
86                False
87                False
88              

In [0]:
similarity['Requirement ID'] = req['Requirement ID']

#similarity.columns = similarity['Requirement ID']

similarity

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,98,99,100,101,102,103,104,105,106,Requirement ID
0,1.000000,0.059825,0.236718,0.122481,0.122457,0.019233,0.000000,0.014958,0.000000,0.000000,...,0.000000,0.000000,0.071214,0.000000,0.015410,0.015060,0.000000,0.000000,0.000000,27.1.1.17
1,0.059825,1.000000,0.245718,0.141825,0.093937,0.063173,0.039171,0.049132,0.029755,0.068376,...,0.069272,0.038225,0.036237,0.000000,0.161745,0.059075,0.027738,0.037899,0.000000,27.1.1.1
2,0.236718,0.245718,1.000000,0.204091,0.131672,0.055768,0.034579,0.043372,0.026267,0.030180,...,0.029438,0.000000,0.000000,0.000000,0.017044,0.016657,0.000000,0.000000,0.000000,27.1.1.9
3,0.122481,0.141825,0.204091,1.000000,0.168585,0.128889,0.061039,0.438133,0.377189,0.026637,...,0.025982,0.000000,0.000000,0.000000,0.015043,0.014701,0.000000,0.000000,0.000000,27.1.1.15
4,0.122457,0.093937,0.131672,0.168585,1.000000,0.073067,0.036621,0.028413,0.000000,0.000000,...,0.000000,0.032012,0.000000,0.000000,0.036101,0.035281,0.000000,0.031739,0.031490,27.1.1.16
5,0.019233,0.063173,0.055768,0.128889,0.073067,1.000000,0.793300,0.415599,0.248067,0.110624,...,0.032511,0.000000,0.049048,0.000000,0.000000,0.000000,0.037545,0.000000,0.101789,
6,0.000000,0.039171,0.034579,0.061039,0.036621,0.793300,1.000000,0.283677,0.433314,0.110889,...,0.115458,0.000000,0.083264,0.069279,0.000000,0.000000,0.037635,0.000000,0.102033,27.2.1.1
7,0.014958,0.049132,0.043372,0.438133,0.028413,0.415599,0.283677,1.000000,0.878106,0.086035,...,0.025285,0.000000,0.038146,0.000000,0.000000,0.000000,0.029200,0.000000,0.079165,27.2.1.1a
8,0.000000,0.029755,0.026267,0.377189,0.000000,0.248067,0.433314,0.878106,1.000000,0.084235,...,0.087706,0.000000,0.063250,0.052627,0.000000,0.000000,0.028589,0.000000,0.077508,27.2.1.2
9,0.000000,0.068376,0.030180,0.026637,0.000000,0.110624,0.110889,0.086035,0.084235,1.000000,...,0.000000,0.000000,0.047610,0.000000,0.000000,0.000000,0.036444,0.000000,0.098805,27.2.1.2b
