In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the trail table from disk; index_col=0 makes the first CSV column the row index.
original_data = pd.read_csv('HKhike.csv', index_col = 0)

In [3]:
# Work on a copy so the frame loaded from disk stays unmodified by the cleaning steps below.
data = original_data.copy()

In [4]:
# Preview the first rows to see the columns and the raw string formats that need parsing.
data.head()

Unnamed: 0_level_0,type,length_km,duration_hour,difficulty,region,type_of_view,rating,elevation_gain
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
MacLehose Trail (Section 1) Pak Tam Chung to Long Ke,Long-distance Trail,10.6,3.0,Hard,Sai Kung,"beach, lake",4.5/5 (AllTrails),"1,250 ft"
MacLehose Trail (Section 2) Long Ke to Pak Tam Au,Long-distance Trail,13.5,5.0,Hard,Sai Kung,"beauty of the coast of Hong Kong, Sai Wan beac...",4.7/5 (AllTrails),"2,168 ft"
MacLehose Trail (Section 3) Pak Tam Au to Kei Ling Ha,Long-distance Trail,10.2,4.0,Hard,Sai Kung,"forests, views",4.5/5 (AllTrails),"1,994 ft"
MacLehose Trail (Section 4) Kei Ling Ha to Tate's Cairn,Long-distance Trail,12.7,5.0,Hard,Sai Kung,"views, wildflowers",4.5/5 (AllTrails),"2,805 ft"
MacLehose Trail (Section 5) Tate's Cairn to Tai Po Road,Long-distance Trail,10.6,4.5,Hard,Central New Territories,forests and views,4.5/5 (AllTrails),"1,791 ft"


In [5]:
# (rows, columns) of the working frame.
data.shape

(89, 8)

In [6]:
# Dtypes and non-null counts per column, to spot missing values and text columns.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89 entries, MacLehose Trail (Section 1) Pak Tam Chung to Long Ke to Wilson Trail overall
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   type            89 non-null     object 
 1   length_km       76 non-null     float64
 2   duration_hour   70 non-null     float64
 3   difficulty      89 non-null     object 
 4   region          89 non-null     object 
 5   type_of_view    89 non-null     object 
 6   rating          68 non-null     object 
 7   elevation_gain  72 non-null     object 
dtypes: float64(2), object(6)
memory usage: 6.3+ KB


In [7]:
# Summary statistics; describe() only covers the numeric columns by default.
data.describe()

Unnamed: 0,length_km,duration_hour
count,76.0,70.0
mean,9.242632,2.385429
std,16.468822,1.415123
min,0.23,0.25
25%,2.575,1.3125
50%,5.5,2.0
75%,9.0,3.0
max,100.0,6.0


In [8]:
# Fill the gaps in both numeric columns with that column's own mean.
# Both means are computed from the non-missing entries before anything is filled.
data = data.fillna({
    'length_km': data['length_km'].mean(),
    'duration_hour': data['duration_hour'].mean(),
})

In [9]:
# Re-check non-null counts after filling length_km and duration_hour.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89 entries, MacLehose Trail (Section 1) Pak Tam Chung to Long Ke to Wilson Trail overall
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   type            89 non-null     object 
 1   length_km       89 non-null     float64
 2   duration_hour   89 non-null     float64
 3   difficulty      89 non-null     object 
 4   region          89 non-null     object 
 5   type_of_view    89 non-null     object 
 6   rating          68 non-null     object 
 7   elevation_gain  72 non-null     object 
dtypes: float64(2), object(6)
memory usage: 6.3+ KB


In [10]:
# elevation_gain is still an object (text) column here, so describe() reports count/unique/top/freq.
data['elevation_gain'].describe()

count           72
unique          71
top       1,833 ft
freq             2
Name: elevation_gain, dtype: object

In [11]:
#Extract the numeric value, convert meters to feet if necessary
import re #Regular expression => search for and extract patterns


# A number written either with thousands separators ("1,250") or as plain
# digits ("1250"), with an optional decimal part. The plain-digit alternative
# is required: with only \d{1,3} plus optional comma groups, re.search skips
# leading digits and reads "1250 ft" as 250.
_ELEVATION_NUMBER = r'((?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?)'
# \b after the unit stops "m" from matching the start of words like "miles" or "min".
_FEET_PATTERN = re.compile(_ELEVATION_NUMBER + r'\s*(?:ft|feet|foot)\b', re.IGNORECASE)
_METERS_PATTERN = re.compile(_ELEVATION_NUMBER + r'\s*(?:meters?|metres?|m)\b', re.IGNORECASE)
_FEET_PER_METER = 3.28084


def extract_elevation(text):
    """Parse an elevation string and return its value in feet.

    Parameters
    ----------
    text : object
        Raw cell value, e.g. "1,250 ft" or "381 m". Anything that is not a
        string (NaN, None, pd.NA, numbers, ...) is treated as missing.

    Returns
    -------
    float or pd.NA
        The elevation in feet. A value given in feet is returned as-is; a
        value given in meters is multiplied by 3.28084. pd.NA is returned
        when the input is not a string or contains no recognizable
        "<number> ft" / "<number> m" pattern.
    """
    # Missing markers (NaN/None/pd.NA) are never str, so this single check covers
    # them and avoids pd.isna() raising on list-like input.
    if not isinstance(text, str):
        return pd.NA

    # Feet take priority when both units appear in the same string.
    match_ft = _FEET_PATTERN.search(text)
    if match_ft:
        # Strip thousands separators before converting: "1,234" -> 1234.0
        return float(match_ft.group(1).replace(',', ''))

    match_m = _METERS_PATTERN.search(text)
    if match_m:
        return float(match_m.group(1).replace(',', '')) * _FEET_PER_METER

    # Neither unit was found.
    return pd.NA

# Replace the raw elevation strings with parsed values in feet.
# NOTE: not safe to re-run on its own. extract_elevation returns pd.NA for any
# non-string input, so applying it to the already-parsed column would blank every value.
data['elevation_gain'] = data['elevation_gain'].apply(extract_elevation)

In [12]:
# Check the parsed column; it holds floats mixed with pd.NA, and info() still reports dtype object.
data['elevation_gain'].info()

<class 'pandas.core.series.Series'>
Index: 89 entries, MacLehose Trail (Section 1) Pak Tam Chung to Long Ke to Wilson Trail overall
Series name: elevation_gain
Non-Null Count  Dtype 
--------------  ----- 
72 non-null     object
dtypes: object(1)
memory usage: 1.4+ KB


In [13]:
# Cast to pandas' nullable Float64 dtype, then replace the missing entries
# with the mean of the values that were successfully parsed.
data['elevation_gain'] = (
    data['elevation_gain']
    .astype('Float64')
    .pipe(lambda parsed: parsed.fillna(parsed.mean()))
)
data['elevation_gain']

name
MacLehose Trail (Section 1) Pak Tam Chung to Long Ke              1250.0
MacLehose Trail (Section 2) Long Ke to Pak Tam Au                 2168.0
MacLehose Trail (Section 3) Pak Tam Au to Kei Ling Ha             1994.0
MacLehose Trail (Section 4) Kei Ling Ha to Tate's Cairn           2805.0
MacLehose Trail (Section 5) Tate's Cairn to Tai Po Road           1791.0
                                                                  ...   
Hong Kong Trail (Section 6) Mount Parker Road to Tai Tam Road      180.0
MacLehose Trail overall                                          16571.0
Hong Kong Trail overall                                           5524.0
Lantau Trail overall                                             10036.0
Wilson Trail overall                                             17740.0
Name: elevation_gain, Length: 89, dtype: Float64

In [14]:
# Re-check dtypes and non-null counts after converting elevation_gain.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89 entries, MacLehose Trail (Section 1) Pak Tam Chung to Long Ke to Wilson Trail overall
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   type            89 non-null     object 
 1   length_km       89 non-null     float64
 2   duration_hour   89 non-null     float64
 3   difficulty      89 non-null     object 
 4   region          89 non-null     object 
 5   type_of_view    89 non-null     object 
 6   rating          68 non-null     object 
 7   elevation_gain  89 non-null     Float64
dtypes: Float64(1), float64(2), object(5)
memory usage: 8.4+ KB


In [15]:
# Non-null count and dtype of the raw rating strings before parsing.
data['rating'].info()

<class 'pandas.core.series.Series'>
Index: 89 entries, MacLehose Trail (Section 1) Pak Tam Chung to Long Ke to Wilson Trail overall
Series name: rating
Non-Null Count  Dtype 
--------------  ----- 
68 non-null     object
dtypes: object(1)
memory usage: 3.4+ KB


In [16]:
# Extract the numeric rating from strings such as "4.5/5 (AllTrails)"
import re #Regular expression => search for and extract patterns


# A rating followed by "/5", e.g. "4.5/5 (AllTrails)". Spaces around the
# slash are tolerated.
_RATING_PATTERN = re.compile(r'(\d+(?:\.\d+)?)\s*/\s*5')


def extract_rating(text):
    """Parse a rating string such as "4.5/5 (AllTrails)" into a float.

    This has its own name so it does not shadow extract_elevation, which
    parses a different column.

    Parameters
    ----------
    text : object
        Raw cell value. Anything that is not a string (NaN, None, pd.NA,
        numbers, ...) is treated as missing.

    Returns
    -------
    float or pd.NA
        The number in front of "/5", or pd.NA when the input is not a string
        or does not contain a "<number>/5" pattern.
    """
    # Missing markers are never str, so one isinstance check covers them.
    if not isinstance(text, str):
        return pd.NA

    match = _RATING_PATTERN.search(text)
    if match:
        return float(match.group(1))

    # No "<number>/5" pattern found. Such values become missing and are
    # imputed in a later cell.
    return pd.NA


# NOTE: not safe to re-run on its own. Applying the parser to the
# already-parsed (non-string) column would turn every value into pd.NA.
data['rating'] = data['rating'].apply(extract_rating)

In [17]:
# Check how many ratings were parsed.
# NOTE(review): in the recorded run the raw column had 68 non-null values and this
# shows 65, so three rating strings did not match the pattern. Worth inspecting them.
data['rating'].info()

<class 'pandas.core.series.Series'>
Index: 89 entries, MacLehose Trail (Section 1) Pak Tam Chung to Long Ke to Wilson Trail overall
Series name: rating
Non-Null Count  Dtype 
--------------  ----- 
65 non-null     object
dtypes: object(1)
memory usage: 3.4+ KB


In [18]:
# Cast to pandas' nullable Float64 dtype, then replace the missing entries
# with the mean of the ratings that were successfully parsed.
data['rating'] = (
    data['rating']
    .astype('Float64')
    .pipe(lambda parsed: parsed.fillna(parsed.mean()))
)
data['rating']

name
MacLehose Trail (Section 1) Pak Tam Chung to Long Ke                  4.5
MacLehose Trail (Section 2) Long Ke to Pak Tam Au                     4.7
MacLehose Trail (Section 3) Pak Tam Au to Kei Ling Ha                 4.5
MacLehose Trail (Section 4) Kei Ling Ha to Tate's Cairn               4.5
MacLehose Trail (Section 5) Tate's Cairn to Tai Po Road               4.5
                                                                   ...   
Hong Kong Trail (Section 6) Mount Parker Road to Tai Tam Road         4.4
MacLehose Trail overall                                               4.3
Hong Kong Trail overall                                               4.6
Lantau Trail overall                                                  4.4
Wilson Trail overall                                             4.270769
Name: rating, Length: 89, dtype: Float64

In [19]:
# Final check of dtypes and non-null counts before exporting.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 89 entries, MacLehose Trail (Section 1) Pak Tam Chung to Long Ke to Wilson Trail overall
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   type            89 non-null     object 
 1   length_km       89 non-null     float64
 2   duration_hour   89 non-null     float64
 3   difficulty      89 non-null     object 
 4   region          89 non-null     object 
 5   type_of_view    89 non-null     object 
 6   rating          89 non-null     Float64
 7   elevation_gain  89 non-null     Float64
dtypes: Float64(2), float64(2), object(4)
memory usage: 8.5+ KB


In [20]:
# DataFrame.to_csv returns None when it is given a file path, so assigning its
# result left data_processed bound to None. Keep the cleaned frame under that
# name, then write it to disk.
data_processed = data.copy()
data_processed.to_csv('data_processed.csv')

In [21]:
# Planning notes for the next stage (feature encoding and clustering experiments).
# NOTE(review): as the last expression in the cell, this string is echoed back as
# the cell's output; a Markdown cell would be a better home for it.
'''
label/one-hot?/binary for categorical features
Test TF-IDF, BoW, and Word embeddings for type_of_view, evaluate with K-means using Silhouette Score, select TF-IDF if best.
Process type, region (one-hot encoding), difficulty (Label Encoding), normalize with StandardScaler.
Use best type to test K-means, DBSCAN, GMM, Hierarchical Clustering; compare Silhouette Scores + interpretability <=> comparable table? -> explain
Visualize best clusters with scatter plot (PCA) and word cloud.
Document: Describe experiments, comparison table, explain cluster meanings (trail recommendations).'''

'\nlabel/one-hot?/binary for categorical features\nTest TF-IDF, BoW, and Word embeddings for type_of_view, evaluate with K-means using Silhouette Score, select TF-IDF if best.\nProcess type, region (one-hot encoding), difficulty (Label Encoding), normalize with StandardScaler.\nUse best type to test K-means, DBSCAN, GMM, Hierarchical Clustering; compare Silhouette Scores + interpretability <=> comparable table? -> explain\nVisualize best clusters with scatter plot (PCA) and word cloud.\nDocument: Describe experiments, comparison table, explain cluster meanings (trail recommendations).'