In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk

pd.set_option('display.max_columns', 1000)

In [2]:
dataset = pd.read_pickle('other_xmltodict_all_major_fields.pkl')

In [3]:
### Drop the rows with no values in section_bodies
dataset = dataset[dataset['section_bodies'] != 'Null']
print("Without Null rows: " + str(len(dataset)))

Without Null rows: 2520


In [4]:
#only keep the SOR and CRC...SI are not included in the scope
dataset['regulation_type'].value_counts()

SOR     2265
SI       246
Null       9
Name: regulation_type, dtype: int64

In [5]:
dataset = dataset[dataset['regulation_type']!= 'SI']

## 1. Creating the Corpus & Sparse Matrix

In [6]:
def make_corpus(dataset, column):
    '''This function reads in a pandas dataframe (dataset) and the column name (e.g. 'text_column'). 
    Then it strips all punctuation, makes it lower case, tokenizes it, and removes english stopwords. 
    Finally it returns two lists (either directly or via tuple unpacking. the first is the documents
    without stopwords; the second is the documents stemmed via porterstemmer.
    '''
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    stop_words = set(stopwords.words('english'))
    import string 
    import sys
    translator = str.maketrans('', '', string.punctuation)
    import re
    from nltk.stem import PorterStemmer
    ps = PorterStemmer()
    filtered_text_list = []
    stemmed_text_list = []
    for i in range (0, len(dataset)):
        text = dataset.iloc[i][column].lower().translate(translator)
        text = re.sub('\n', ' ', text)
        word_tokens = word_tokenize(text) 
        filtered_text = [w for w in word_tokens if not w in stop_words]
        filtered_text = []
        for w in word_tokens:
            if w not in stop_words:
                filtered_text.append(w)
        stemmed_text = []
        for w in filtered_text:
            stemmed_text.append(ps.stem(w))
        filtered_text_list.append(str(filtered_text).translate(translator))
        stemmed_text_list.append(str(stemmed_text).translate(translator))
        sys.stdout.write("\r" + "Creating Corpus.. Processing Record: " + str(i+1) + " of " + str(len(dataset)))
        sys.stdout.flush()
    print("\nDONE!")
    return filtered_text_list, stemmed_text_list

In [7]:
filtered_text, stemmed_text = make_corpus(dataset,'section_bodies')

Creating Corpus.. Processing Record: 2274 of 2274
DONE!


In [8]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(stemmed_text).toarray()

In [9]:
X.shape

(2274, 12595)

In [10]:
df_sparse = pd.DataFrame(X, columns = cv.get_feature_names())

## 2. Cleaning the Sparse Matrix

In [11]:
features = pd.Series(df_sparse.columns)

##### Removing features which contain a number

In [12]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

In [13]:
#remove all the features which are numeric
index = features.map(hasNumbers)

In [14]:
#how many are numeric?
index.value_counts()

False    7512
True     5083
dtype: int64

In [15]:
#list of features to drop
dropped_features = features[index]

In [16]:
len(dropped_features)

5083

In [17]:
#Drop it like its hot
df_sparse = df_sparse.drop(columns=dropped_features)

In [18]:
df_sparse.shape

(2274, 7512)

##### Removing the helping verbs

In [19]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [20]:
#There are 23 helping verbs in total
helping_verbs = ['am', 'is', 'are', 'was', 'were', 'being', 'been', 'be', 'have', 'has', 'had', 'do', 'does', 
                 'did', 'will', 'would', 'shall', 'should', 'may', 'might', 'must', 'can', 'could']

#Obtain the stemed version of each word
helping_verbs_stemmed = []
for verb in helping_verbs:
    helping_verbs_stemmed.append(ps.stem(verb).lower())

In [21]:
#Which helping verbs are in the sparse matrix?
helping_verbs_stemmed_drop = []

cols = list(df_sparse.columns)

for verb in helping_verbs_stemmed:
    if verb in cols:
        helping_verbs_stemmed_drop.append(verb)
        
print (helping_verbs_stemmed_drop)

['will', 'would', 'shall', 'may', 'might', 'must', 'could']


In [22]:
#Remove the helping verbs and check the outcomes
print ('shape before dropping helping verbs:' + str(df_sparse.shape))

df_sparse = df_sparse.drop(columns=helping_verbs_stemmed_drop)

print ('shape after dropping helping verbs:' + str(df_sparse.shape))

shape before dropping helping verbs:(2274, 7512)
shape after dropping helping verbs:(2274, 7505)


## 3. TF-IDF Vectorizer

In [23]:
#convert the count matrix to tf-idf representation
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(df_sparse).toarray()

In [24]:
#df_sparse = pd.DataFrame(tfidf, columns = cols)

In [25]:
feature_names = df_sparse.columns

# find maximum value for each of the features over all of dataset:
max_val = df_sparse.max(axis=0).ravel()

#sort weights from smallest to biggest and extract their indices 
sort_by_tfidf = max_val.argsort()

print("Features with lowest tfidf:\n{}".format(
      feature_names[sort_by_tfidf[:20]]))

print("\nFeatures with highest tfidf: \n{}".format(
      feature_names[sort_by_tfidf[-20:]]))

Features with lowest tfidf:
Index(['northwestel', 'speech', 'premiumpay', 'spednic', 'spectat',
       'discredit', 'prepaidport', 'discriminatori', 'sphere', 'iran', 'ipc',
       'ioniz', 'ionexchang', 'dish', 'dishonest', 'sovereignti', 'ioc',
       'sparkler', 'discomfort', 'discolor'],
      dtype='object')

Features with highest tfidf: 
Index(['appeal', 'particular', 'system', 'categori', 'licenc', 'produc',
       'hazard', 'act', 'particip', 'certif', 'ferri', 'store', 'vessel',
       'subsect', 'dive', 'suppli', 'invest', 'food', 'plan', 'explos'],
      dtype='object')


## 4. Latent Semantic Analysis

In [26]:
from sklearn.decomposition import TruncatedSVD
lsa = TruncatedSVD(n_components = 100, algorithm='arpack', random_state=0)
dtm_lsa = lsa.fit_transform(tfidf)

In [27]:
#What are the top components of each concept?
for i, comp in enumerate(lsa.components_):
    terms_in_comp = zip(cols,comp)
    sorted_terms = sorted(terms_in_comp, key=lambda x: x[1], reverse=True)[:10]
    print('Concept %d:' %i)
    
    for term in sorted_terms:
        print(term[0])
    print (' ')
    

Concept 0:
forbid
come
datum
regi
registrar
lamproom
optim
nwtsoirc
secretari
perpendicular
 
Concept 1:
lamproom
nwtsoirc
plywood
element
airport
build
stripe
appli
grown
oviduct
 
Concept 2:
board
act
perpendicular
subrul
marker
commod
author
poult
secretari
purg
 
Concept 3:
board
commod
marker
poult
exempt
protrud
interpret
tp
optim
exponentialtimeaverag
 
Concept 4:
immov
prisonerofwar
articl
convent
remarri
canada
ordinari
optim
extens
grandpar
 
Concept 5:
remarri
grandpar
gone
implic
condit
optim
custodian
pave
paddleboat
dustproof
 
Concept 6:
compani
bank
entireti
remarri
hoist
grandpar
insul
invert
acquir
subsidiari
 
Concept 7:
gone
station
canada
implic
registrar
chapter
exponentialtimeaverag
come
forbid
enabl
 
Concept 8:
permafrost
exponentialtimeaverag
gone
author
regi
implic
optim
compani
scare
bank
 
Concept 9:
insul
proclaim
miner
loadcarri
lever
applic
board
lend
compani
prioriti
 
Concept 10:
column
servant
scare
proclaim
lever
italic
amount
paus
font
pave
 
Concep

In [28]:
list_concepts = []
    
for i in range(0,len(lsa.components_)):
    list_concepts.append('Concept %d' %i)
    
df_lsa = pd.DataFrame(dtm_lsa, columns=list_concepts)

df_lsa

Unnamed: 0,Concept 0,Concept 1,Concept 2,Concept 3,Concept 4,Concept 5,Concept 6,Concept 7,Concept 8,Concept 9,Concept 10,Concept 11,Concept 12,Concept 13,Concept 14,Concept 15,Concept 16,Concept 17,Concept 18,Concept 19,Concept 20,Concept 21,Concept 22,Concept 23,Concept 24,Concept 25,Concept 26,Concept 27,Concept 28,Concept 29,Concept 30,Concept 31,Concept 32,Concept 33,Concept 34,Concept 35,Concept 36,Concept 37,Concept 38,Concept 39,Concept 40,Concept 41,Concept 42,Concept 43,Concept 44,Concept 45,Concept 46,Concept 47,Concept 48,Concept 49,Concept 50,Concept 51,Concept 52,Concept 53,Concept 54,Concept 55,Concept 56,Concept 57,Concept 58,Concept 59,Concept 60,Concept 61,Concept 62,Concept 63,Concept 64,Concept 65,Concept 66,Concept 67,Concept 68,Concept 69,Concept 70,Concept 71,Concept 72,Concept 73,Concept 74,Concept 75,Concept 76,Concept 77,Concept 78,Concept 79,Concept 80,Concept 81,Concept 82,Concept 83,Concept 84,Concept 85,Concept 86,Concept 87,Concept 88,Concept 89,Concept 90,Concept 91,Concept 92,Concept 93,Concept 94,Concept 95,Concept 96,Concept 97,Concept 98,Concept 99
0,0.133869,0.009151,0.163808,-0.116088,0.016744,0.011887,-0.094306,-0.011585,0.037939,-0.175331,0.345300,0.151291,-0.149515,-0.043991,0.011330,0.046791,0.063265,-0.100130,0.070397,0.131186,-0.072668,-0.110688,-0.072583,0.043131,0.038753,0.002544,0.274045,-0.127520,0.099704,-0.032315,-0.063223,0.309143,0.136656,-0.009330,0.032651,-0.079209,0.078531,0.020441,-0.059283,-0.155243,-0.024651,0.125597,-0.050829,0.102534,-0.008854,0.054635,0.039810,-0.029672,0.093739,-0.084810,0.026407,-0.097363,0.053123,-0.025429,-0.029235,0.063678,0.037002,3.623601e-03,-0.013142,0.096728,-0.028526,0.068509,0.073763,0.035280,-0.023833,-0.027624,0.030543,-0.012879,0.003534,-0.010372,0.008721,0.031634,0.058234,0.008837,-0.020180,0.041570,-0.048314,0.019509,-0.037781,0.081859,0.018719,-0.050301,0.039456,-0.063776,-0.015486,0.005744,-0.003512,0.023557,-0.032751,-0.015980,0.016520,-0.048725,0.010458,-0.007559,-0.024895,-0.029319,0.008756,-0.019034,0.002844,0.016143
1,0.040606,0.006009,0.050907,-0.036570,0.023986,0.006046,-0.016135,0.031183,-0.006259,0.007440,0.059679,0.014844,-0.005533,0.001092,0.001951,0.015375,0.006001,-0.015494,0.043211,-0.017797,-0.005461,-0.034303,-0.016443,0.032498,0.009691,-0.012396,0.013555,0.027627,0.015275,0.027867,-0.043693,-0.030031,-0.001740,0.020797,0.010958,-0.012952,0.007577,-0.003907,-0.000254,-0.007110,0.003372,-0.002505,0.008646,0.029338,0.017723,-0.032044,0.036203,0.037802,0.013838,0.048920,0.032897,0.020530,-0.003634,0.021591,0.008657,-0.034911,-0.030603,-2.291098e-02,0.002577,-0.105327,-0.004768,-0.064606,0.009019,0.046242,0.038407,0.032114,0.028951,-0.013300,0.005073,0.002715,-0.030343,-0.022097,-0.033708,0.050182,-0.027979,-0.040575,0.071214,-0.041731,0.059895,0.005634,0.021192,0.022351,0.017944,0.071310,0.002616,0.021564,0.000063,-0.032226,-0.000331,-0.002474,0.040784,0.023176,0.004360,0.007491,0.083713,0.050721,-0.071069,-0.053899,-0.040355,0.032900
2,0.431893,0.737588,-0.145732,0.047491,0.031886,0.017033,0.013733,-0.013343,0.042420,-0.006422,0.014444,0.018418,0.007394,0.022134,0.006597,0.003155,-0.022483,0.030875,-0.001983,0.046125,-0.020644,-0.036410,-0.022121,-0.047174,0.007922,0.084058,0.003500,0.032514,-0.029697,0.029179,-0.000315,-0.003458,0.011764,-0.002432,0.003308,-0.003573,-0.017009,0.006628,-0.017504,-0.021680,-0.002770,0.009269,-0.004329,0.024108,-0.046424,0.009715,0.003663,0.038887,-0.033378,0.011762,-0.026898,-0.015461,0.002626,-0.015188,-0.030096,-0.016173,-0.006494,2.797625e-02,-0.048303,0.006331,-0.027902,0.010210,-0.005972,-0.036171,0.022431,-0.021606,0.021355,0.006078,0.030416,-0.022505,0.025521,-0.029698,-0.011564,0.005830,0.006485,0.003723,-0.011735,0.042538,0.015574,0.009830,-0.012730,0.051077,0.017495,0.004754,-0.036551,0.011860,0.029957,-0.003741,0.001877,-0.012793,0.012078,0.000971,-0.005438,-0.002583,-0.011776,-0.007992,-0.018537,-0.025496,-0.005225,-0.005733
3,0.075667,-0.005102,0.106623,-0.051184,0.139599,0.183566,0.072486,0.060194,-0.020454,0.010968,0.064791,0.000753,0.028741,-0.044031,-0.010114,-0.017043,0.016428,-0.047793,-0.035333,0.030663,0.007207,0.043488,0.023907,-0.042898,-0.008026,0.005911,-0.003388,-0.105569,0.060749,0.099581,-0.017028,-0.065216,0.029912,-0.003747,-0.039052,-0.079460,-0.018181,0.018865,0.112351,-0.070946,-0.004434,-0.035039,0.016074,-0.033543,-0.087644,0.054933,-0.037484,0.019584,0.000312,0.049908,0.070301,0.039173,-0.018513,-0.110972,0.026745,-0.014226,0.002274,-6.983448e-07,0.014850,-0.002192,0.067706,0.036263,0.049503,-0.064573,-0.027093,-0.052044,0.058591,-0.071956,0.037549,0.044328,-0.018073,-0.050359,0.029415,0.059310,-0.018191,-0.021483,0.013101,0.003222,-0.055173,0.032673,-0.020164,0.126231,-0.037242,0.013767,0.020277,-0.026032,-0.037974,-0.057926,-0.050052,-0.035545,0.007719,0.003841,0.058499,0.027588,-0.057169,0.007321,-0.047575,0.045228,0.055001,-0.101002
4,0.142396,0.036921,0.242153,0.132612,-0.046667,0.021654,-0.116130,0.045910,0.048684,0.174931,0.122105,-0.125720,0.129946,-0.130873,-0.114482,-0.047732,0.047013,-0.029853,-0.126403,-0.000701,0.070188,0.040653,0.071611,-0.004345,0.064729,-0.039231,-0.081132,0.003513,-0.014786,-0.031547,-0.130679,0.126522,-0.269005,-0.042484,-0.155965,-0.076379,0.009915,-0.090987,-0.024105,-0.090712,-0.050904,-0.022816,0.008771,0.031730,0.024674,0.124548,-0.040160,-0.011523,0.088245,-0.032403,0.055337,-0.011304,-0.161427,-0.056792,-0.040389,0.028920,0.110110,2.502277e-02,0.093665,0.039658,-0.014694,0.034716,-0.082729,-0.072716,0.027917,0.067755,0.023394,-0.043956,-0.082715,0.010951,-0.049663,-0.026945,-0.077780,-0.055605,0.028247,0.051371,-0.021272,-0.012269,0.001073,-0.010459,0.003736,0.077855,0.015034,-0.015023,-0.024300,-0.021072,-0.069764,-0.012520,-0.002899,0.013316,0.066383,-0.074390,-0.030195,-0.015220,0.072249,0.027629,0.015259,-0.036711,-0.004592,0.030156
5,0.456751,0.824029,-0.203078,0.047288,0.009718,0.006198,0.022772,-0.014193,-0.018621,0.029917,-0.021816,-0.043931,-0.024599,0.002969,0.020584,-0.005542,-0.001178,0.006330,0.011719,0.000796,-0.012100,-0.000612,-0.008047,0.003597,0.000666,-0.008613,-0.003630,0.015896,-0.005024,-0.025293,0.013144,0.014465,-0.000557,0.034802,0.030923,-0.044946,0.062470,0.010801,0.007267,0.004093,-0.003429,-0.023602,0.028999,-0.030116,0.089343,-0.033663,-0.026939,-0.039293,0.056956,0.012624,0.021317,0.042599,0.001034,-0.015444,0.025165,0.005356,0.025235,-1.344819e-02,0.011662,-0.007713,0.005401,0.017185,-0.005116,0.016463,-0.006812,0.009440,-0.015113,-0.013362,0.019971,0.000770,0.008526,-0.002428,0.006369,-0.003355,0.008930,-0.007088,0.008743,-0.014458,-0.010869,-0.007054,0.019089,-0.034168,-0.011351,-0.003131,0.002287,-0.008117,-0.023882,0.004480,-0.004207,-0.004922,-0.007905,-0.010016,0.005569,-0.004926,-0.003882,0.003674,-0.001456,0.009776,0.007271,0.001879
6,0.188825,-0.009331,0.467798,0.697329,-0.008610,-0.068486,0.080660,0.008878,0.000430,-0.077546,-0.066352,0.065223,-0.062398,0.015363,0.025164,0.019555,-0.010224,0.027214,0.037129,-0.008886,0.031895,-0.031115,-0.025751,-0.049751,-0.012805,-0.073546,0.006310,0.016522,-0.030142,-0.018001,-0.059291,0.027297,-0.018509,0.008740,-0.019910,-0.022758,0.004476,-0.021836,0.015881,0.014955,0.023503,-0.006614,-0.004432,-0.011003,-0.000237,0.021900,-0.019456,0.011538,-0.011788,0.033784,-0.050039,0.042574,0.039299,-0.023866,0.030502,0.048003,-0.076252,-7.450090e-02,-0.020806,0.034232,-0.041017,0.013170,0.092365,-0.015176,0.007263,0.039641,-0.103279,-0.004428,-0.022356,-0.008360,0.034363,0.013184,-0.010099,0.051220,-0.093213,-0.013868,-0.053093,0.116408,0.118234,-0.033487,-0.013036,0.010184,-0.090023,-0.007390,-0.078396,0.018326,-0.084987,-0.054707,-0.102325,0.014667,0.039226,-0.023356,-0.022650,-0.005890,-0.004421,-0.001198,0.044165,-0.013884,-0.026741,0.018397
7,0.166388,-0.011019,0.171656,-0.137692,-0.138979,-0.085902,0.088595,-0.094685,0.109557,-0.010233,-0.056658,-0.032311,0.050825,-0.065944,0.034910,-0.100174,0.131403,-0.046396,0.062169,0.055555,-0.087007,-0.141738,0.248164,0.016370,0.087743,-0.098788,-0.081071,-0.050904,0.024019,0.074936,-0.033841,0.026808,0.020836,0.034540,0.143894,-0.019160,0.042721,-0.030402,0.024041,-0.002424,-0.014913,-0.028573,-0.056969,0.033149,0.018063,0.013046,-0.037623,0.057402,-0.148304,0.102096,-0.012425,-0.006474,-0.002220,0.008910,-0.012586,0.051632,-0.111967,-3.452097e-02,0.059802,-0.072850,-0.043851,-0.002320,-0.068730,0.009344,0.054232,0.048287,-0.010141,0.029512,0.026374,-0.020820,0.068829,-0.011480,-0.003710,-0.039507,0.036701,-0.015736,0.008371,-0.076963,-0.119887,0.050104,0.055248,0.058653,-0.052702,0.060124,-0.080449,-0.018296,0.056712,0.049745,-0.033298,0.052081,0.028644,0.004947,0.019453,-0.086217,0.014893,-0.017264,0.051728,-0.099141,-0.037177,0.021107
8,0.102865,0.001962,0.112469,-0.091816,-0.050959,-0.029702,0.041936,-0.031372,0.033931,0.029093,0.023464,-0.009452,-0.022533,-0.056700,-0.057963,0.068494,-0.123061,-0.034318,-0.050030,-0.024463,-0.022754,0.020726,-0.040923,0.016035,-0.079160,-0.020657,-0.032343,-0.021383,-0.024264,-0.011839,-0.019190,-0.002440,0.023940,0.005944,0.033573,-0.034089,0.023460,0.034677,-0.030802,-0.000993,0.017368,-0.015165,0.000086,-0.032654,-0.048733,0.014331,0.060031,-0.014264,-0.026440,0.001954,0.008909,0.043345,-0.016073,-0.016802,0.036651,-0.003149,0.033559,-1.240359e-02,-0.007625,0.045861,0.021532,-0.009151,0.051065,0.006943,0.024998,-0.003010,-0.063079,-0.000774,0.017797,-0.006691,0.016302,-0.076859,0.007255,-0.031136,0.029598,0.017267,-0.010118,-0.024103,-0.011273,0.004081,-0.001357,0.015480,0.022838,0.003712,-0.008416,-0.041805,0.023081,0.040087,-0.044391,0.056124,0.009938,-0.010930,0.032042,-0.005822,0.009872,0.011312,-0.025116,-0.015509,-0.042898,-0.002465
9,0.065935,0.005299,0.162254,0.119862,-0.024922,0.000359,0.074706,0.037101,-0.047192,-0.166096,0.008807,0.040497,-0.012254,0.057452,0.056934,-0.062365,0.001168,0.037480,-0.130906,0.016384,0.058536,0.051021,-0.012430,0.119143,-0.013118,0.022056,-0.039753,-0.041370,-0.037346,0.106663,0.039094,-0.161784,0.020120,-0.006705,0.010179,0.013721,0.142294,-0.113523,-0.115834,-0.020806,0.029121,-0.043576,0.013636,0.004356,0.100732,-0.006397,-0.099716,0.145945,-0.062821,0.106819,-0.079367,0.018046,-0.015066,-0.057209,0.034478,0.112381,-0.027846,-4.215635e-02,-0.015967,0.156527,-0.068910,-0.062422,0.035549,-0.015271,-0.060906,-0.051610,0.032632,-0.132915,-0.054841,-0.076990,-0.016521,0.032097,0.009016,-0.054726,-0.135605,0.027357,0.144364,0.032565,-0.053163,-0.027869,0.066127,-0.031944,0.114751,-0.175003,-0.030496,0.013152,0.015850,-0.034143,0.073376,-0.015280,-0.032065,0.041327,-0.074125,0.034916,-0.016674,-0.008509,-0.064650,-0.023966,0.060663,0.061955


###### Document Similarity using LSA

In [29]:
from sklearn.preprocessing import Normalizer
df_norm = Normalizer(copy=False).fit_transform(dtm_lsa)

In [30]:
similarity = np.asarray(np.asmatrix(df_norm) * np.asmatrix(df_norm).T)
regs = list(dataset['instrument_number'])

In [31]:
df_heatmap = pd.DataFrame(similarity,index=regs, columns=regs)

In [32]:
#df_heatmap

In [33]:
'''
After color coding in excel, it can be easily seen which regulatiosn are very similar to eachother. Upon manually
opening a couple, it can be seen that often regulations will attempt to convey similar (if not identical) meanings
with inconsistent language
'''

#export to excel for analysis...seaborn heatmap takes too long to load on my machine
df_heatmap.to_excel('LSA_Doc_Similarity_v1.xlsx')

## 5. Extracting the highly similar documents

In [37]:
#minimum score/value for regulations to be considered similar
thresh_min = 0.95

#max score/value, above which regulations are the same document
thresh_max = 0.999999

In [38]:
import sys

In [39]:
#column names 
regs = list(df_heatmap.columns)

#master list of similar regs to append to
similar_regs = []

#analyze each column's scores one at a time
for reg_1 in regs:
    reg_1_scores = list(df_heatmap[reg_1])
    
    #track the index value (integer) which meet the criteria
    index_value = 0
    list_index = []

    for score in reg_1_scores:
        if (score < thresh_max) and (score < thresh_max):
            list_index.append(index_value)
        index_value += 1
    
    #track the reg pairs which exceed the criteria - check for duplicates
    for i in list_index:
        reg_2 = df_heatmap.index[i]
        
        if [reg_2,reg_1] not in similar_regs:
            similar_regs.append([reg_1,reg_2])
            
    print("\nDONE!") 

Creating Corpus.. Processing Record: 2274 of 2274
DONE!
Creating Corpus.. Processing Record: 2274 of 2274
DONE!
Creating Corpus.. Processing Record: 2274 of 2274
DONE!
Creating Corpus.. Processing Record: 2274 of 2274
DONE!
Creating Corpus.. Processing Record: 2274 of 2274
DONE!
Creating Corpus.. Processing Record: 2274 of 2274
DONE!
Creating Corpus.. Processing Record: 2274 of 2274
DONE!
Creating Corpus.. Processing Record: 2274 of 2274
DONE!
Creating Corpus.. Processing Record: 2274 of 2274
DONE!
Creating Corpus.. Processing Record: 2274 of 2274
DONE!
Creating Corpus.. Processing Record: 2274 of 2274
DONE!
Creating Corpus.. Processing Record: 2274 of 2274
DONE!
Creating Corpus.. Processing Record: 2274 of 2274
DONE!


KeyboardInterrupt: 

In [None]:
sys.stdout.write("\r" + "Creating Corpus.. Processing Record: " + str(index_value) + " of " + str(len(regs)))
        sys.stdout.flush()
    print("\nDONE!")

##### Checking the outputs

Given that the formula for the number of possible cominations where order doesn't matter:

C(n,r)=n!/(n−r)!r!

In [None]:
for regs_1 in regs:
    reg_1_scores = list(df_heatmap[reg_1])

In [None]:
for score in reg_1_scores:
    print (score)

In [None]:
list_index

## Which Lebels in the Regs are Most Similar?

In [None]:
for regs_1 in regs:
    reg_1_scores = list(df_heatmap[reg_1])

In [None]:
len(reg_1_scores)

In [None]:
len(test)

In [None]:
len(regs)

In [None]:
df_heatmap[regs]