# Setup

In [1]:
# Import libraries
import pandas as pd

In [2]:
# Load the datasets
df1 = pd.read_csv("../data/old_training_dataset.csv")
df2 = pd.read_csv("../data/new_training_dataset.csv")

In [3]:
df1.head()

Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWidnow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1


In [4]:
df2.head()

Unnamed: 0,URLLength,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,URLCharProb,TLDLength,NoOfSubDomain,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,31,24,0,com,100.0,1.0,0.522907,0.061933,3,1,...,0,0,1,34,20,28,119,0,124,1
1,23,16,0,de,100.0,0.666667,0.03265,0.050207,2,1,...,0,0,1,50,9,8,39,0,217,1
2,29,22,0,uk,100.0,0.866667,0.028555,0.064129,2,2,...,0,0,1,10,2,7,42,2,5,1
3,26,19,0,com,100.0,1.0,0.522907,0.057606,3,1,...,1,1,1,3,27,15,22,1,31,1
4,33,26,0,org,100.0,1.0,0.079963,0.059441,3,1,...,1,0,1,244,15,34,72,1,85,1


There are a few features that appear to be similar based on their names.

We can expect some issues because the older dataset has only binary features. For example, the first visible one is the 'URL_Length' which in the newer dataset has values such as 31, 23, 29... However, we plan to check the final distribution to see if normalization/scaling is needed.

In [5]:
# Rename columns in df2 to match df1
df2 = df2.rename(columns={
    "URLLength": "URL_Length",
    "DomainLength": "Domain_registeration_length",
    "NoOfSubDomain": "having_Sub_Domain",
    "HasFavicon": "Favicon",
    "NoOfURLRedirect": "Redirect",
    "NoOfiFrame": "Iframe"
})

In [6]:
df2.head()

Unnamed: 0,URL_Length,Domain_registeration_length,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,URLCharProb,TLDLength,having_Sub_Domain,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,31,24,0,com,100.0,1.0,0.522907,0.061933,3,1,...,0,0,1,34,20,28,119,0,124,1
1,23,16,0,de,100.0,0.666667,0.03265,0.050207,2,1,...,0,0,1,50,9,8,39,0,217,1
2,29,22,0,uk,100.0,0.866667,0.028555,0.064129,2,2,...,0,0,1,10,2,7,42,2,5,1
3,26,19,0,com,100.0,1.0,0.522907,0.057606,3,1,...,1,1,1,3,27,15,22,1,31,1
4,33,26,0,org,100.0,1.0,0.079963,0.059441,3,1,...,1,0,1,244,15,34,72,1,85,1


This step of selecting features is important as we want to ensure consistency between the two datasets.

Some features had similar meaning but different names.
- 'NoOfURLRedirect' was renamed to 'Redirect' despite the presence of 'NoOfSelfRedirect' because it aligns better with the idea of how many times the user is redirected through external URLs.
- 'DomainLength' was mapped to 'Domain_registeration_length'
- And the rest was renamed to their respective equivalents as well, based on their descriptions and role

In [7]:
# To keep only common columns, we decided to simply take the intersection of their sets
common_columns = list(set(df1.columns) & set(df2.columns))
df1_cc = df1[common_columns].copy()
df2_cc = df2[common_columns].copy()

In [8]:
# Combine datasets
merged_df = pd.concat([df1_cc, df2_cc], ignore_index=True) # We included ignore_index because Pandas preserves the original row indices, and we want to reindex everything from scratch for a cleaner version

In [10]:
# Preview
print(merged_df.head())

   Redirect  Iframe  URL_Length  Domain_registeration_length  Favicon  \
0         0       1           1                           -1        1   
1         0       1           1                           -1        1   
2         0       1           0                           -1        1   
3         0       1           0                            1        1   
4         0       1           0                           -1        1   

   having_Sub_Domain  
0                 -1  
1                  0  
2                 -1  
3                 -1  
4                  1  


In [12]:
# Save as csv
merged_df.to_csv("../data/merged_dataset.csv", index=False)

# Explore

In [13]:
merged_df.head()

Unnamed: 0,Redirect,Iframe,URL_Length,Domain_registeration_length,Favicon,having_Sub_Domain
0,0,1,1,-1,1,-1
1,0,1,1,-1,1,0
2,0,1,0,-1,1,-1
3,0,1,0,1,1,-1
4,0,1,0,-1,1,1
