## Importing the necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

#### Declaring the Variables and Parameters

In [2]:
SEED = 42

## Reading the data from files and processing it

In [3]:
# Paths to the Variations in P/T and National Codes
variation_folder_path = './Data/code-variations-2015'

alberta_building = variation_folder_path + '/CCT Codes Comparison Import Alberta Building v2.csv'
bc_buildingA = variation_folder_path + '/CCT Codes Comparison Import BC Building DIV A.csv'
bc_buildingB = variation_folder_path + '/CCT Codes Comparison Import BC Building Div B.csv'
energy = variation_folder_path + '/CCT Codes Comparison Import Energy Code.csv'
fire = variation_folder_path + '/CCT Codes Comparison Import Fire Code.csv'
nl_building = variation_folder_path + '/CCT Codes Comparison Import NL Building.csv'
on_building_1_and_3 = variation_folder_path + '/CCT Codes Comparison Import ON Building Part 1 and 3.csv'
on_building_4_to_7 = variation_folder_path + '/CCT Codes Comparison Import ON Building Part 4 to 7.csv'
on_building_9 = variation_folder_path + '/CCT Codes Comparison Import ON Building Part 9 v2.csv'
on_building_8_to_12_no_9 = variation_folder_path + '/CCT Codes Comparison Import ON Building Parts 8 10 11 12.csv'
pei_building = variation_folder_path + '/CCT Codes Comparison Import PEI Building.csv'
plumbing = variation_folder_path + '/CCT Codes Comparison Import Plumbing.csv'
qc_building = variation_folder_path + '/CCT Codes Comparison Import QC Building.csv'
sk_building = variation_folder_path + '/CCT Codes Comparison Import SK Building.csv'


# Paths to the full P/T and National Codes
full_2015_folder_path = './Data/code-full-2015'

full_national_2015 = full_2015_folder_path + '/National Codes 2015 sentences.xlsx'
full_pt_2015 = full_2015_folder_path + '/PT Sentence Data 2015.xlsx'

##### Function to read the data from the file, drop any rows that are completely empty, and print the shape of the dataframe

In [4]:
# Function to read in the data, drop any rows that are completely empty, and print the shape of the dataframe
def read_data(file_path, code_type):
    df = pd.read_csv(file_path, encoding='latin1')
    df.dropna(how='all', inplace=True)
    df['Code Type'] = code_type
    print(df.shape)
    print(df.columns)
    return df

##### Reading the Variations files into dataframes, adding a column with *Code Type*, and combining them

In [5]:
alberta_building_df = read_data(alberta_building, 'Building')

(775, 34)
Index(['Code Year', 'Province/Territory', 'Code Book', 'National Division',
       'National Sentence Number', 'National Article Title',
       'National Article Title (FR)', 'National Sentence Number (FR)',
       'National Sentence Text', 'P/T Document', 'Matched P/T Division',
       'Matched P/T Sentence Number', 'Matched P/T Article Title',
       'Matched P/T Article Title (FR)', 'Matched P/T Sentence Number (FR)',
       'Matched P/T Sentence Text', 'Text Difference Tracked',
       'Difference Type', 'Variation?', 'Variation Label', 'Exception?',
       'Comments', 'Code Part', 'Code Article', 'Code Section',
       'Code Subsection', 'Code Sentence', 'National Sentence Text (FR)',
       'Matched P/T Sentence Text (FR)', 'Text Difference Tracked (FR)',
       'Difference Type Updated?', 'Exception Updated?', 'Variation Updated?',
       'Code Type'],
      dtype='object')


In [6]:
bc_buildingA_df = read_data(bc_buildingA, 'Building')

(18, 27)
Index(['Code Year', 'Province/Territory', 'Code Book', 'National Division',
       'National Sentence Number', 'National Article Title',
       'National Sentence Text', 'National Article Title (FR)',
       'National Sentence Text (FR)', 'P/T Document', 'P/T Division',
       'P/T Sentence Number', 'P/T Article Title', 'P/T Sentence Text',
       'P/T Article Title (FR)', 'P/T Sentence Text (FR)', 'Difference Type',
       'Variation', 'Variation Label', 'Text Difference Tracked', 'Exception',
       'Comments', 'Text Difference Tracked (FR)', 'Difference Type Updated',
       'Exception Updated', 'Variation Updated', 'Code Type'],
      dtype='object')


In [7]:
bc_buildingB_df = read_data(bc_buildingB, 'Building')

(789, 34)
Index(['Code Year', 'Province/Territory', 'Code Book', 'National Division',
       'National Sentence Number', 'National Article Title',
       'National Article Title (FR)', 'National Sentence Number (FR)',
       'National Sentence Text', 'P/T Document', 'Matched P/T Division',
       'Matched P/T Sentence Number', 'Matched P/T Article Title',
       'Matched P/T Article Title (FR)', 'Matched P/T Sentence Number (FR)',
       'Matched P/T Sentence Text', 'Text Difference Tracked',
       'Difference Type', 'Variation?', 'Variation Label', 'Exception?',
       'Comments', 'Code Part', 'Code Article', 'Code Section',
       'Code Subsection', 'Code Sentence', 'National Sentence Text (FR)',
       'Matched P/T Sentence Text (FR)', 'Text Difference Tracked (FR)',
       'Difference Type Updated?', 'Exception Updated?', 'Variation Updated?',
       'Code Type'],
      dtype='object')


In [8]:
energy_df = read_data(energy, 'Energy')

(1389, 27)
Index(['Code Year', 'Province/Territory', 'Code Book', 'National Division',
       'National Sentence Number', 'National Article Title',
       'National Sentence Text', 'National Article Title (FR)',
       'National Sentence Text (FR)', 'P/T Document', 'P/T Division',
       'P/T Sentence Number', 'P/T Article Title', 'P/T Sentence Text',
       'P/T Article Title (FR)', 'P/T Sentence Text (FR)', 'Difference Type',
       'Variation', 'Variation Label', 'Text Difference Tracked', 'Exception',
       'Comments', 'Text Difference Tracked (FR)', 'Difference Type Updated',
       'Exception Updated', 'Variation Updated', 'Code Type'],
      dtype='object')


In [9]:
fire_df = read_data(fire, 'Fire')

(3965, 27)
Index(['Code Year', 'Province/Territory', 'Code Book', 'National Division',
       'National Sentence Number', 'National Article Title',
       'National Sentence Text', 'National Article Title (FR)',
       'National Sentence Text (FR)', 'P/T Document', 'P/T Division',
       'P/T Sentence Number', 'P/T Article Title', 'P/T Sentence Text',
       'P/T Article Title (FR)', 'P/T Sentence Text (FR)', 'Difference Type',
       'Variation', 'Variation Label', 'Text Difference Tracked', 'Exception',
       'Comments', 'Text Difference Tracked (FR)', 'Difference Type Updated',
       'Exception Updated', 'Variation Updated', 'Code Type'],
      dtype='object')


In [10]:
nl_building_df = read_data(nl_building, 'Building')

(206, 27)
Index(['Code Year', 'Province/Territory', 'Code Book', 'National Division',
       'National Sentence Number', 'National Article Title',
       'National Sentence Text', 'National Article Title (FR)',
       'National Sentence Text (FR)', 'P/T Document', 'P/T Division',
       'P/T Sentence Number', 'P/T Article Title', 'P/T Sentence Text',
       'P/T Article Title (FR)', 'P/T Sentence Text (FR)', 'Difference Type',
       'Variation', 'Variation Label', 'Text Difference Tracked', 'Exception',
       'Comments', 'Text Difference Tracked (FR)', 'Difference Type Updated',
       'Exception Updated', 'Variation Updated', 'Code Type'],
      dtype='object')


In [11]:
on_building_1_and_3_df = read_data(on_building_1_and_3, 'Building')

(1874, 27)
Index(['Code Year', 'Province/Territory', 'Code Book', 'National Division',
       'National Sentence Number', 'National Article Title',
       'National Sentence Text', 'P/T Document', 'P/T Division',
       'P/T Sentence Number', 'P/T Article Title', 'P/T Sentence Text',
       'Text Difference Tracked', 'Difference Type', 'Variation',
       'Variation Label', 'Exception', 'Comments',
       'National Article Title (FR)', 'National Sentence Text (FR)',
       'P/T Article Title (FR)', 'P/T Sentence Text (FR)',
       'Text Difference Tracked (FR)', 'Difference Type Updated',
       'Exception Updated', 'Variation Updated', 'Code Type'],
      dtype='object')


In [12]:
on_building_4_to_7_df = read_data(on_building_4_to_7, 'Building')

(1173, 27)
Index(['Code Year', 'Province/Territory', 'Code Book', 'National Division',
       'NationalSentenceNumber', 'National Article Title',
       'National Sentence Text', 'P/T Document', 'P/T Division',
       'P/TSentenceNumber', 'P/T Article Title', 'P/T Sentence Text',
       'Difference Type', 'Variation', 'Variation Label',
       'Text Difference Tracked', 'Exception', 'Comments',
       'National Article Title (FR)', 'National Sentence Text (FR)',
       'P/T Article Title (FR)', 'P/T Sentence Text (FR)',
       'Text Difference Tracked (FR)', 'Difference Type Updated',
       'Exception Updated', 'Variation Updated', 'Code Type'],
      dtype='object')


In [13]:
on_building_9_df = read_data(on_building_9, 'Building')

(1629, 27)
Index(['Code Year', 'Province/Territory', 'Code Book', 'National Division',
       'P/T Document', 'P/T Division', 'P/T Sentence Number',
       'P/T Article Title', 'P/T Sentence Text', 'National Sentence Number',
       'National Article Title', 'National Sentence Text',
       'Text Difference Tracked', 'Difference Type', 'Variation',
       'Variation Label', 'Exception', 'Comments',
       'National Article Title (FR)', 'National Sentence Text (FR)',
       'P/T Article Title (FR)', 'P/T Sentence Text (FR)',
       'Text Difference Tracked (FR)', 'Difference Type Updated',
       'Exception Updated', 'Variation Updated', 'Code Type'],
      dtype='object')


In [14]:
on_building_8_to_12_no_9_df = read_data(on_building_8_to_12_no_9, 'Building')

(323, 27)
Index(['Code Year', 'Province/Territory', 'Code Book', 'National Division',
       'P/T Document', 'P/T Division', 'P/T Sentence Number',
       'P/T Article Title', 'P/T Sentence Text', 'National Sentence Number',
       'National Article Title', 'National Sentence Text',
       'Text Difference Tracked', 'Difference Type', 'Variation',
       'Variation Label', 'Exception', 'Comments',
       'National Article Title (FR)', 'National Sentence Text (FR)',
       'P/T Article Title (FR)', 'P/T Sentence Text (FR)',
       'Text Difference Tracked (FR)', 'Difference Type Updated',
       'Exception Updated', 'Variation Updated', 'Code Type'],
      dtype='object')


In [15]:
pei_building_df = read_data(pei_building, 'Building')

(38, 34)
Index(['Code Year', 'Province/Territory', 'Code Book', 'National Division',
       'National Sentence Number', 'National Article Title',
       'National Article Title (FR)', 'National Sentence Number (FR)',
       'National Sentence Text', 'P/T Document', 'Matched P/T Division',
       'Matched P/T Sentence Number', 'Matched P/T Article Title',
       'Matched P/T Article Title (FR)', 'Matched P/T Sentence Number (FR)',
       'Matched P/T Sentence Text', 'Text Difference Tracked',
       'Difference Type', 'Variation?', 'Variation Label', 'Exception?',
       'Comments', 'Code Part', 'Code Article', 'Code Section',
       'Code Subsection', 'Code Sentence', 'National Sentence Text (FR)',
       'Matched P/T Sentence Text (FR)', 'Text Difference Tracked (FR)',
       'Difference Type Updated?', 'Exception Updated?', 'Variation Updated?',
       'Code Type'],
      dtype='object')


In [16]:
plumbing_df = read_data(plumbing, 'Plumbing')

(849, 27)
Index(['Code Year', 'Province/Territory', 'Code Book', 'National Division',
       'National Sentence Number', 'National Article Title',
       'National Sentence Text', 'National Article Title (FR)',
       'National Sentence Text (FR)', 'P/T Document', 'P/T Division',
       'P/T Sentence Number', 'P/T Article Title', 'P/T Sentence Text',
       'P/T Sentence Text (FR)', 'Difference Type', 'Variation',
       'Variation Label', 'Text Difference Tracked', 'P/T Article Title (FR)',
       'Exception', 'Comments', 'Text Difference Tracked (FR)',
       'Difference Type Updated', 'Exception Updated', 'Variation Updated',
       'Code Type'],
      dtype='object')


In [17]:
qc_building_df = read_data(qc_building, 'Building')

(1305, 27)
Index(['Code Year', 'Province/Territory', 'Code Book', 'National Division',
       'National Sentence Number', 'National Article Title',
       'National Sentence Text', 'National Article Title (FR)',
       'National Sentence Text (FR)', 'P/T Document', 'P/T Division',
       'P/T(Sentence(Number', 'P/T Article Title', 'P/T Sentence Text',
       'P/T Article Title (FR)', 'P/T Sentence Text (FR)', 'Difference Type',
       'Variation', 'Variation Label', 'Exception', 'Comments',
       'Text Difference Tracked', 'Text Difference Tracked (FR)',
       'Difference Type Updated', 'Exception Updated', 'Variation Updated',
       'Code Type'],
      dtype='object')


In [18]:
sk_building_df = read_data(sk_building, 'Building')

(35, 34)
Index(['Code Year', 'Province/Territory', 'Code Book', 'National Division',
       'National Sentence Number', 'National Article Title',
       'National Article Title (FR)', 'National Sentence Number (FR)',
       'National Sentence Text', 'P/T Document', 'Matched P/T Division',
       'Matched P/T Sentence Number', 'Matched P/T Article Title',
       'Matched P/T Article Title (FR)', 'Matched P/T Sentence Number (FR)',
       'Matched P/T Sentence Text', 'Text Difference Tracked',
       'Difference Type', 'Variation?', 'Variation Label', 'Exception?',
       'Comments', 'Code Part', 'Code Article', 'Code Section',
       'Code Subsection', 'Code Sentence', 'National Sentence Text (FR)',
       'Matched P/T Sentence Text (FR)', 'Text Difference Tracked (FR)',
       'Difference Type Updated?', 'Exception Updated?', 'Variation Updated?',
       'Code Type'],
      dtype='object')


#### Checking if all the columns of the dataframes, with the same number of columns, are the same

##### Dataframes with 34 columns

In [19]:
# print(alberta_building_df.columns == bc_buildingB_df.columns )
# print(bc_buildingB_df.columns == pei_building_df.columns)
# print(pei_building_df.columns == sk_building_df.columns)

All the dataframes with 34 columns are the same.

##### Dataframes with 27 columns

In [20]:
# print(bc_buildingA_df.columns == energy_df.columns)
# print(energy_df.columns == fire_df.columns)
# print(fire_df.columns == nl_building_df.columns)
# These have the same columns /\



# These do not have the same columns \/
# print(nl_building_df.columns == on_building_1_and_3_df.columns)
# print(on_building_1_and_3_df.columns == on_building_4_to_7_df.columns)
# print(on_building_4_to_7_df.columns == on_building_8_to_12_no_9_df.columns)
# print(on_building_8_to_12_no_9_df.columns == on_building_9_df.columns)
# print(on_building_9_df.columns == plumbing_df.columns)
# print(plumbing_df.columns == qc_building_df.columns)


# Get the column order from nl_building_df
column_order = nl_building_df.columns

# Reindex the columns of the other dataframes to match the column order of nl_building_df
on_building_1_and_3_df = on_building_1_and_3_df.reindex(columns=column_order)
on_building_4_to_7_df = on_building_4_to_7_df.reindex(columns=column_order)
on_building_8_to_12_no_9_df = on_building_8_to_12_no_9_df.reindex(columns=column_order)
on_building_9_df = on_building_9_df.reindex(columns=column_order)
plumbing_df = plumbing_df.reindex(columns=column_order)
qc_building_df = qc_building_df.reindex(columns=column_order)

# Check if the columns are the same
# print(nl_building_df.columns == on_building_1_and_3_df.columns)
# print(on_building_1_and_3_df.columns == on_building_4_to_7_df.columns)
# print(on_building_4_to_7_df.columns == on_building_8_to_12_no_9_df.columns)
# print(on_building_8_to_12_no_9_df.columns == on_building_9_df.columns)
# print(on_building_9_df.columns == plumbing_df.columns)
# print(plumbing_df.columns == qc_building_df.columns)

#### Combining all the dataframes into *variation_df*

##### DataFrame with 34 columns

In [21]:
variation_df_1 = pd.concat([alberta_building_df, bc_buildingB_df, pei_building_df, sk_building_df], ignore_index=True)
variation_df_1.shape

(1637, 34)

##### DataFrame with 27 columns

In [22]:
variation_df_2 = pd.concat([bc_buildingA_df, energy_df, fire_df, nl_building_df, on_building_1_and_3_df, on_building_4_to_7_df, on_building_8_to_12_no_9_df, on_building_9_df, plumbing_df, qc_building_df], ignore_index=True)
variation_df_2.shape

(12731, 27)

In [23]:
# Check for the column names in the two dataframes
print(variation_df_1.columns)
print(variation_df_2.columns)


Index(['Code Year', 'Province/Territory', 'Code Book', 'National Division',
       'National Sentence Number', 'National Article Title',
       'National Article Title (FR)', 'National Sentence Number (FR)',
       'National Sentence Text', 'P/T Document', 'Matched P/T Division',
       'Matched P/T Sentence Number', 'Matched P/T Article Title',
       'Matched P/T Article Title (FR)', 'Matched P/T Sentence Number (FR)',
       'Matched P/T Sentence Text', 'Text Difference Tracked',
       'Difference Type', 'Variation?', 'Variation Label', 'Exception?',
       'Comments', 'Code Part', 'Code Article', 'Code Section',
       'Code Subsection', 'Code Sentence', 'National Sentence Text (FR)',
       'Matched P/T Sentence Text (FR)', 'Text Difference Tracked (FR)',
       'Difference Type Updated?', 'Exception Updated?', 'Variation Updated?',
       'Code Type'],
      dtype='object')
Index(['Code Year', 'Province/Territory', 'Code Book', 'National Division',
       'National Sentence Numb

##### Changing the names of columns in the *variation_df_1* dataframe to match with those in the *variation_df_2* dataframe

In [24]:
# Removing the 'Matched ' and '?' from the column names
variation_df_1.columns = variation_df_1.columns.str.replace('Matched ', '') \
    .str.replace('?', '')

len(variation_df_2.columns)

27

##### Combining the *variation_df_1* and *variation_df_2* into one dataframe (filling the new columns with Nan values)

In [25]:
variation_df = pd.concat([variation_df_1, variation_df_2], axis=0, ignore_index=True)
variation_df.head()

Unnamed: 0,Code Year,Province/Territory,Code Book,National Division,National Sentence Number,National Article Title,National Article Title (FR),National Sentence Number (FR),National Sentence Text,P/T Document,...,Code Section,Code Subsection,Code Sentence,National Sentence Text (FR),P/T Sentence Text (FR),Text Difference Tracked (FR),Difference Type Updated,Exception Updated,Variation Updated,Code Type
0,2015.0,AB,NBC,Div A,1.1.1.1.(1),Application of this Code,,,This Code applies to any one or more of the fo...,NBC AB2019,...,1.1,1.1.1,1.1.1.1.(1),,,,,,,Building
1,2015.0,AB,NBC,Div A,1.1.1.1.(3),Application of this Code,,,,NBC AB2019,...,1.1,1.1.1,1.1.1.1.(3),,,,,,,Building
2,2015.0,AB,NBC,Div A,,,,,,NBC AB2019,...,1.1,1.1.1,1.1.1.1.(3),,,,,,,Building
3,2015.0,AB,NBC,Div A,,,,,,NBC AB2019,...,1.1,1.1.1,1.1.1.1.(4),,,,,,,Building
4,2015.0,AB,NBC,Div A,,,,,,NBC AB2019,...,1.1,1.1.1,1.1.1.1.(5),,,,,,,Building


In [26]:
variation_df.value_counts('Province/Territory')

Province/Territory
ON     9250
QC     2626
AB     1109
BC      967
NL      206
NS       83
SK       77
PE       38
NU        8
PEI       3
Name: count, dtype: int64

Let us change the *PEI* to *PE*

In [27]:
variation_df['Province/Territory'].replace({'PEI': 'PE'}, inplace=True)
print(variation_df.value_counts('Province/Territory'))
print(f'The shape of the variation_df is {variation_df.shape}')

Province/Territory
ON    9250
QC    2626
AB    1109
BC     967
NL     206
NS      83
SK      77
PE      41
NU       8
Name: count, dtype: int64
The shape of the variation_df is (14368, 34)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  variation_df['Province/Territory'].replace({'PEI': 'PE'}, inplace=True)


##### Saving the combined data into a csv file

(Please uncomment the code cell below to do that)

In [28]:
# variation_df.to_csv('./Data/code-variations-2015/Full 2015 Variation Data.csv', index=False)

For now, we will be working with Division B since most of the sentences are not missing.

##### Checking for the column names in variation_df and the value counts of P/T Division

In [29]:
variation_df['P/T Division'].value_counts()

P/T Division
Div B                     13402
obc2019_SB-10_Div3_Ch3      620
Div A                       109
obc2019_SB-10_Div3_Ch1       21
NECB2017_DivB                 1
Name: count, dtype: int64

In [30]:
variation_df['National Division'].value_counts()

National Division
Div B    14256
Div A      107
Div C        5
Name: count, dtype: int64

In [31]:
len(variation_df[variation_df['P/T Division'].isna()])


215

There are about 215 with no missing 'P/T Division' values. We will ignore the 'P/T Division' column since it is the same as 'National Division'.

### Isolating the rows with Division B

In [32]:
variation_df_B = variation_df[variation_df['National Division'] == 'Div B']
variation_df_B.shape

(14256, 34)

### Splitting the 2015 variation data based on the Province/Territory

##### Alberta

We must also check for the non-NaN National sentence text values in each of these P/T to make sure all the sentences are present in each of them.

In [33]:
ab_df = variation_df_B[variation_df_B['Province/Territory'] == 'AB']
print(ab_df.shape)

(1037, 34)


##### British Columbia

In [34]:
bc_df = variation_df_B[variation_df_B['Province/Territory'] == 'BC']
print(bc_df.shape)

(942, 34)


##### Newfoundland and Labrador 
(We won't be using NL since it does not exist in the full code dataset)

In [35]:
nl_df = variation_df_B[variation_df_B['Province/Territory'] == 'NL']
print(nl_df.shape)

(206, 34)


##### Nova Scotia

In [36]:
ns_df = variation_df_B[variation_df_B['Province/Territory'] == 'NS']
print(ns_df.shape)

(83, 34)


##### Nunavut

In [37]:
nu_df = variation_df_B[variation_df_B['Province/Territory'] == 'NU']
print(nu_df.shape)

(8, 34)


##### Ontario

In [38]:
on_df = variation_df_B[variation_df_B['Province/Territory'] == 'ON']
print(on_df.shape)

(9245, 34)


##### Prince Edward Island

In [39]:
pe_df = variation_df_B[variation_df_B['Province/Territory'] == 'PE']
print(pe_df.shape)

(41, 34)


##### Quebec 
(Won't be using QC for now since it is in French)

In [40]:
qc_df = variation_df_B[variation_df_B['Province/Territory'] == 'QC']
print(qc_df.shape)

(2626, 34)


##### Saskatchewan

In [41]:
sk_df = variation_df_B[variation_df_B['Province/Territory'] == 'SK']
print(sk_df.shape)

(67, 34)


### Reading the Full code data

In [42]:
national_2015_df = pd.read_excel(full_national_2015)
pt_2015_df = pd.read_excel(full_pt_2015)

In [43]:
print(pt_2015_df.columns)
print(national_2015_df.columns)

Index(['DOCTYPE', 'PT', 'Code Year', 'Code Book', 'Division', 'Language',
       'DOCID', 'DIVISION', 'PT Sentence Number', 'ARTICLE_TITLE',
       'PT Sentence Text', 'PARTNUM', 'SECTIONNUM', 'SUBSECTIONNUM',
       'ARTICLENUM', 'SENTENCENUM'],
      dtype='object')
Index(['ID', 'SEQ', 'DOCTYPE', 'DOCID', 'IDWITHINDOC', 'DIVISION', 'PROVISION',
       'ARTICLE_TITLE', 'FRAG_DOCUMENT', 'FRAG_DOCUMENT_NOWHITESPACE',
       'WORDCOUNT', 'PARTNUM', 'SECTIONNUM', 'SUBSECTIONNUM', 'ARTICLENUM',
       'SENTENCENUM'],
      dtype='object')


### Isolating rows from full code data with Division B

In [44]:
national_2015_df['DIVISION'].value_counts()

DIVISION
B    13863
C      290
A      180
Name: count, dtype: int64

In [45]:
pt_2015_df['DIVISION'].value_counts()

DIVISION
B    134173
C      2700
A      1625
Name: count, dtype: int64

In [46]:
national_2015_df_B = national_2015_df[national_2015_df['DIVISION'] == 'B']
pt_2015_df_B = pt_2015_df[pt_2015_df['DIVISION'] == 'B']

In [47]:
print("Shape of the dataframes")
print(f"P/T Codes: {pt_2015_df_B.shape}")
print(f"National Codes: {national_2015_df_B.shape}\n")

print("Number of missing sentences in")
print(f"P/T Codes: {pt_2015_df_B['PT Sentence Text'].isna().sum()}")
print(f"National Codes: {national_2015_df_B['FRAG_DOCUMENT'].isna().sum()}")

Shape of the dataframes
P/T Codes: (134173, 16)
National Codes: (13863, 16)

Number of missing sentences in
P/T Codes: 62664
National Codes: 6684


#### Removing the empty sentences and storing the sentence texts in a dataframe

In [48]:
pt_2015_df_B.dropna(subset=['PT Sentence Text'], inplace=True)
national_2015_df_B.dropna(subset=['FRAG_DOCUMENT'], inplace=True)

print("Number of text sentences in")
print(f'P/T Codes: {pt_2015_df_B.shape[0]}')
print(f'National Codes: {national_2015_df_B.shape[0]}')

Number of text sentences in
P/T Codes: 71509
National Codes: 7179


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pt_2015_df_B.dropna(subset=['PT Sentence Text'], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  national_2015_df_B.dropna(subset=['FRAG_DOCUMENT'], inplace=True)


##### Splitting the national sentence texts into train and test sets

In [66]:
national_2015_B_unique = national_2015_df_B.drop_duplicates('FRAG_DOCUMENT')

national_train, national_test = train_test_split(national_2015_B_unique, test_size=0.2, random_state=SEED)
national_train_sentences = national_train['FRAG_DOCUMENT']
national_test_sentences = national_test['FRAG_DOCUMENT']

print(f"Train: {national_train.shape}")
print(f"Test: {national_test.shape}")
print(f"Common sentences between the National train/test: {set(national_train_sentences) & set(national_test_sentences)}")

Train: (5679, 16)
Test: (1420, 16)
Common sentences between the National train/test: set()


In [67]:
national_2015_df_B.shape

(7179, 16)

##### Checking for the duplicates in the National sentence texts

In [84]:
# duplicate_values = national_2015_df_B['FRAG_DOCUMENT'][national_2015_df_B.duplicated(subset='FRAG_DOCUMENT')]
# duplicate_values.tolist()

There are about 80 duplicate National sentence texts in the 2015 full code.

#### Splitting the 2015 full P/T codes based on individual P/T and further splitting them into train test sets based on the national_train_sentences and national_test_sentences.

##### Function to calculate the token similarity

In [71]:
def token_similarity(s1, s2):
    if isinstance(s1, str) and isinstance(s2, str):
        words1 = s1.strip().split()
        words2 = s2.strip().split()
        # common_words = set(words1) & set(words2)
        
        common_words = [word for word in words1 if word in words2]
        similarity = len(common_words) / max(len(words1), len(words2))
        return similarity >= 0.90
    else:
        return False

##### Function to split individual Province/Territories data into train/test sets and save them as csv files

In [86]:
def split_and_save_data(df, string):

    # Creating a duplicate dataframe to work with
    ddf = df

    # Isolating the train/test sentences in full national code data with exact match and removing them from the dataframe
    train = ddf[ddf['National Sentence Text'].isin(national_train_sentences)]
    ddf = ddf[~ddf.index.isin(train.index)]

    test = ddf[ddf['National Sentence Text'].isin(national_test_sentences)]
    ddf = ddf[~ddf.index.isin(test.index)]


    # Checking for sentences with a 90% match in the train and test data and removing them from the dataframe
    train = pd.concat(train, ddf[ddf['National Sentence Text'].apply(lambda x: any(token_similarity(x, s) for s in national_train_sentences))], ignore_index=True)
    ddf = ddf[~ddf.index.isin(train.index)]
    
    test = pd.concat(test, ddf[ddf['National Sentence Text'].apply(lambda x: any(token_similarity(x, s) for s in national_test_sentences))], ignore_index=True)
    ddf = ddf[~ddf.index.isin(test.index)]


    # Checking for empty National Sentence Texts and combining all three dataframes
    empty = ddf[ddf['National Sentence Text'].isna()]
    main = pd.concat([train, test, empty])
    other_national = ddf[~ddf.index.isin(main.index)]

    # # Isolating the national sentence texts in the variations data but not in the full data
    #  other_national = df[~df.index.isin(main.index)]

    empty_train = pd.DataFrame()
    empty_test = pd.DataFrame()
    
    if empty.shape[0] != 0:
        total_train = int(np.ceil(0.8 * main.shape[0]))
        total_test = main.shape[0] - total_train

        empty_train_len = total_train - train.shape[0]
        empty_test_len = total_test - test.shape[0]

        empty_train, empty_test = train_test_split(empty, train_size=empty_train_len, test_size=empty_test_len, random_state=SEED)

    train_set = pd.concat([train, empty_train])
    test_set = pd.concat([test, empty_test])

    print(f"{string}")
    print(f"Full Data: {df.shape[0]}")
    print(f"Train: {train_set.shape[0]}")
    print(f"Test: {test_set.shape[0]}")
    print(f"National sentences in variations data but not in full data: {other_national.shape[0]}")

    # Check if there are any common sentences between the train and test data
    print(f"Common sentences between train and test: {(set(train['National Sentence Text']) & set(test['National Sentence Text']))}")

    # Saving the train and test dataframes as csv files
    train_set.to_csv(f'./Data/new-train-test-sets/{string} Train.csv', index=False)
    test_set.to_csv(f'./Data/new-train-test-sets/{string} Test.csv', index=False)

    return train_set, test_set, other_national

In [73]:
ab_train, ab_test, ab_other = split_and_save_data(ab_df, "Alberta")

Alberta
Full Data: 1037
Train: 650
Test: 162
National sentences in variations data but not in full data: 226
Common sentences between train and test: {'The building referred to in Sentence 3.2.2.81.(1) shall be of noncombustible construction, and floor assemblies shall be fire separations with a fire-resistance rating not less than 1 h, mezzanines shall have a fire-resistance rating not less than 1 h, roof assemblies shall have a fire-resistance rating not less than 1 h, and loadbearing walls, columns and arches shall have a fire-resistance rating not less than that required for the supported assembly.'}


In [369]:
ab_train['National Sentence Text']

41     Except as permitted by Articles 3.2.4.10. and ...
69     Except as permitted by Sentence 3.2.7.9.(3), t...
75     Except as otherwise stated in this Section, ai...
76           Non-fixed seating shall conform to the NFC.
85     Where Class IA or IB liquids specified in Subs...
                             ...                        
176                                                  NaN
233                                                  NaN
397                                                  NaN
645                                                  NaN
229                                                  NaN
Name: National Sentence Text, Length: 723, dtype: object

In [367]:
bc_train, bc_test, bc_other = split_and_save_data(bc_df, "British Columbia")

InvalidParameterError: The 'test_size' parameter of train_test_split must be a float in the range (0.0, 1.0), an int in the range [1, inf) or None. Got -7 instead.

In [305]:
ns_train, ns_test, ns_other = split_and_save_data(ns_df, "Nova Scotia")

Nova Scotia
Full Data: 83
Train: 60
Test: 15
National sentences in variations data but not in full data: 8
Common sentences between train and test: 0


In [306]:
nu_train, nu_test, nu_other = split_and_save_data(nu_df, "Nunavut")

Nunavut
Full Data: 8
Train: 6
Test: 2
National sentences in variations data but not in full data: 0
Common sentences between train and test: 0


In [307]:
on_train, on_test, on_other = split_and_save_data(on_df, "Ontario")

Ontario
Full Data: 9245
Train: 4992
Test: 1247
National sentences in variations data but not in full data: 3006
Common sentences between train and test: 0


In [308]:
pe_train, pe_test, pe_other = split_and_save_data(pe_df, "Prince Edward Island")

Prince Edward Island
Full Data: 41
Train: 31
Test: 7
National sentences in variations data but not in full data: 3
Common sentences between train and test: 0


In [309]:
sk_train, sk_test, sk_other = split_and_save_data(sk_df, "Saskatchewan")

Saskatchewan
Full Data: 67
Train: 32
Test: 7
National sentences in variations data but not in full data: 28
Common sentences between train and test: 0


In [343]:
ns_other['National Sentence Text']

1655     Buildings shall comply with\na) the prescripti...
3381     1) The maximum permissible occupant load for a...
3382     2) The number of occupants permitted to enter ...
3390                                     Display Fireworks
3391     1) The handling and discharge of fireworks sha...
3393     1) When any portion of a fire protection syste...
12214     A soil-or-waste pipe shall be of a size not l...
12218    Systems for solar heating of potable water sha...
Name: National Sentence Text, dtype: object