In [282]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [283]:
# Load the raw trail table; the path is relative to the notebook's working directory.
original_data = pd.read_csv('HKhike.csv')

In [284]:
# Work on a copy so the frame as loaded stays available unchanged in `original_data`.
data = original_data.copy()

In [285]:
# Preview the first five rows to check that the file parsed as expected.
data.head()

Unnamed: 0,name,type,length_km,duration_hour,difficulty,region,type_of_view,rating,elevation_gain
0,MacLehose Trail (Section 1) Pak Tam Chung to L...,Long-distance Trail,10.6,3.0,Hard,Sai Kung,"beach, lake",4.5/5 (AllTrails),"1,250 ft"
1,MacLehose Trail (Section 2) Long Ke to Pak Tam Au,Long-distance Trail,13.5,5.0,Hard,Sai Kung,"beauty of the coast of Hong Kong, Sai Wan beac...",4.7/5 (AllTrails),"2,168 ft"
2,MacLehose Trail (Section 3) Pak Tam Au to Kei ...,Long-distance Trail,10.2,4.0,Hard,Sai Kung,"forests, views",4.5/5 (AllTrails),"1,994 ft"
3,MacLehose Trail (Section 4) Kei Ling Ha to Tat...,Long-distance Trail,12.7,5.0,Hard,Sai Kung,"views, wildflowers",4.5/5 (AllTrails),"2,805 ft"
4,MacLehose Trail (Section 5) Tate's Cairn to Ta...,Long-distance Trail,10.6,4.5,Hard,Central New Territories,forests and views,4.5/5 (AllTrails),"1,791 ft"


In [286]:
# (rows, columns) of the working frame.
data.shape

(89, 9)

In [287]:
# Dtypes and non-null counts per column: shows which columns have missing values
# and which numeric-looking fields are still stored as text (object).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            89 non-null     object 
 1   type            89 non-null     object 
 2   length_km       76 non-null     float64
 3   duration_hour   70 non-null     float64
 4   difficulty      89 non-null     object 
 5   region          89 non-null     object 
 6   type_of_view    89 non-null     object 
 7   rating          68 non-null     object 
 8   elevation_gain  72 non-null     object 
dtypes: float64(2), object(7)
memory usage: 6.4+ KB


In [288]:
# Summary statistics for the numeric columns (only float columns are included).
data.describe()

Unnamed: 0,length_km,duration_hour
count,76.0,70.0
mean,9.242632,2.385429
std,16.468822,1.415123
min,0.23,0.25
25%,2.575,1.3125
50%,5.5,2.0
75%,9.0,3.0
max,100.0,6.0


In [289]:
# Fill missing values in each numeric column with that column's own mean.
# NOTE(review): describe() shows length_km is right-skewed (mean 9.24 vs median 5.5,
# max 100), so the median may be a more representative fill value — confirm.
for column in ('length_km', 'duration_hour'):
    data[column] = data[column].fillna(data[column].mean())

In [290]:
# Re-check non-null counts after imputing length_km and duration_hour.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            89 non-null     object 
 1   type            89 non-null     object 
 2   length_km       89 non-null     float64
 3   duration_hour   89 non-null     float64
 4   difficulty      89 non-null     object 
 5   region          89 non-null     object 
 6   type_of_view    89 non-null     object 
 7   rating          68 non-null     object 
 8   elevation_gain  72 non-null     object 
dtypes: float64(2), object(7)
memory usage: 6.4+ KB


In [291]:
# elevation_gain is still an object (text) column here, so describe() reports
# count / unique / top / freq rather than numeric statistics.
data['elevation_gain'].describe()

count           72
unique          71
top       1,833 ft
freq             2
Name: elevation_gain, dtype: object

In [292]:
#Extract the numeric value, convert meters to feet if necessary
import re #Regular expression => search for and extract patterns


FEET_PER_METER = 3.28084  # conversion factor applied to readings given in metres

# A number: one or more digits, optional ",ddd" thousands groups, optional decimals.
# The look-behind stops a match from starting in the middle of a longer number, so
# "1250 ft" is read as 1250 rather than being truncated to its last three digits.
_ELEVATION_NUMBER = r'(?<![\d,])(\d+(?:,\d{3})*(?:\.\d+)?)'
# The trailing \b keeps the unit from matching the start of another word
# (e.g. the "m" of "miles" or "min").
_ELEVATION_FT = re.compile(_ELEVATION_NUMBER + r'\s*(?:ft|feet|foot)\b', re.IGNORECASE)
_ELEVATION_M = re.compile(_ELEVATION_NUMBER + r'\s*(?:m|meters?|metres?)\b', re.IGNORECASE)


def extract_elevation(text):
    """Parse an elevation string and return the value in feet.

    A value given in feet is preferred; if none is present, a value in metres
    is converted to feet.

    Parameters
    ----------
    text : object
        Raw cell value, e.g. "1,250 ft" or "310 m". Anything that is not a
        string (including NaN / None / pd.NA) is treated as missing.

    Returns
    -------
    float or pd.NA
        Elevation in feet, or pd.NA when no feet or metre value is found.
    """
    # Missing values and other non-strings carry no parsable elevation.
    if not isinstance(text, str):
        return pd.NA

    # Feet first: the value is already in the target unit.
    match_ft = _ELEVATION_FT.search(text)
    if match_ft:
        # Drop thousands separators ("1,234" -> "1234") before converting.
        return float(match_ft.group(1).replace(',', ''))

    # Otherwise fall back to metres and convert.
    match_m = _ELEVATION_M.search(text)
    if match_m:
        return float(match_m.group(1).replace(',', '')) * FEET_PER_METER

    # Neither unit was found.
    return pd.NA

# Replace each raw elevation string with its parsed value (pd.NA where nothing parses).
data['elevation_gain'] = data['elevation_gain'].map(extract_elevation)

In [293]:
# Non-null count after parsing: how many rows yielded an elevation value.
data['elevation_gain'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 89 entries, 0 to 88
Series name: elevation_gain
Non-Null Count  Dtype 
--------------  ----- 
72 non-null     object
dtypes: object(1)
memory usage: 844.0+ bytes


In [294]:
# Cast to the nullable Float64 dtype (pd.NA-aware), then fill the gaps with the
# column mean computed from the parsed values.
data['elevation_gain'] = (
    data['elevation_gain']
    .astype('Float64')
    .pipe(lambda col: col.fillna(col.mean()))
)
data['elevation_gain']

0          1250.0
1          2168.0
2          1994.0
3          2805.0
4          1791.0
5           679.0
6          1971.0
7          2047.0
8           439.0
9          1213.0
10          515.0
11      1017.0604
12      1607.6116
13         2395.0
14         1204.0
15    1623.465187
16          741.0
17          485.0
18         1833.0
19         1414.0
20          885.0
21          295.0
22          275.0
23         1893.0
24         1131.0
25         1833.0
26      2099.7376
27         1433.0
28         2286.0
29          259.0
30          426.0
31         1571.0
32          685.0
33          944.0
34          505.0
35          767.0
36         1233.0
37          551.0
38          288.0
39          413.0
40         1381.0
41          797.0
42    1623.465187
43          990.0
44     1217.19164
45      1410.7612
46    1623.465187
47    1623.465187
48          951.0
49           95.0
50          521.0
51       131.2336
52    1623.465187
53    1623.465187
54       26.24672
55    1623

In [295]:
# Verify elevation_gain is now a fully populated Float64 column.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            89 non-null     object 
 1   type            89 non-null     object 
 2   length_km       89 non-null     float64
 3   duration_hour   89 non-null     float64
 4   difficulty      89 non-null     object 
 5   region          89 non-null     object 
 6   type_of_view    89 non-null     object 
 7   rating          68 non-null     object 
 8   elevation_gain  89 non-null     Float64
dtypes: Float64(1), float64(2), object(6)
memory usage: 6.5+ KB


In [296]:
# Inspect the raw rating column (object dtype, with missing values) before parsing.
data['rating'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 89 entries, 0 to 88
Series name: rating
Non-Null Count  Dtype 
--------------  ----- 
68 non-null     object
dtypes: object(1)
memory usage: 844.0+ bytes


In [297]:
# Extract the numeric score from rating strings such as "4.5/5 (AllTrails)".
import re  # regular expressions: search for and extract text patterns

# A number with optional decimals, followed by "/5". Whitespace around the slash is
# tolerated, and (?!\d) rejects other scales such as "/50".
_RATING_OUT_OF_5 = re.compile(r'(\d+(?:\.\d+)?)\s*/\s*5(?!\d)')


def extract_rating(text):
    """Return the score from an "N/5" rating string as a float.

    Named separately from `extract_elevation` so that defining it does not
    overwrite the elevation parser.

    Parameters
    ----------
    text : object
        Raw cell value, e.g. "4.5/5 (AllTrails)". Anything that is not a
        string (including NaN / None / pd.NA) is treated as missing.

    Returns
    -------
    float or pd.NA
        The number before "/5", or pd.NA when the text has no such pattern.
    """
    # Missing values and other non-strings carry no parsable rating.
    if not isinstance(text, str):
        return pd.NA

    match = _RATING_OUT_OF_5.search(text)
    return float(match.group(1)) if match else pd.NA


# NOTE(review): not every non-null rating is in "N/5" form (the recorded run parsed
# 65 of 68); inspect the unparsed strings rather than letting them be mean-imputed
# silently.
data['rating'] = data['rating'].apply(extract_rating)

In [298]:
# Non-null count after parsing; compare with the count before parsing to see how
# many rating strings did not match the expected pattern.
data['rating'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 89 entries, 0 to 88
Series name: rating
Non-Null Count  Dtype 
--------------  ----- 
65 non-null     object
dtypes: object(1)
memory usage: 844.0+ bytes


In [299]:
# Cast to the nullable Float64 dtype (pd.NA-aware), then fill the gaps with the
# mean of the ratings that were parsed.
data['rating'] = (
    data['rating']
    .astype('Float64')
    .pipe(lambda col: col.fillna(col.mean()))
)
data['rating']

0          4.5
1          4.7
2          4.5
3          4.5
4          4.5
5          4.2
6          4.5
7          4.6
8          4.1
9          4.1
10         4.5
11    4.270769
12         4.4
13         4.2
14         4.4
15    4.270769
16         4.6
17         3.6
18         4.6
19         4.3
20         4.5
21         4.2
22         4.0
23         4.6
24         4.3
25         4.3
26         4.1
27         4.2
28         4.6
29         3.8
30         4.4
31         4.5
32    4.270769
33         4.0
34         3.9
35         4.2
36         4.4
37         4.3
38         4.5
39         3.9
40         4.4
41         4.5
42    4.270769
43         4.3
44         4.6
45    4.270769
46    4.270769
47         3.7
48         3.9
49         4.4
50         3.6
51    4.270769
52    4.270769
53         3.0
54    4.270769
55    4.270769
56    4.270769
57    4.270769
58         4.6
59    4.270769
60         3.9
61         4.0
62         4.0
63         4.0
64    4.270769
65    4.270769
66        

In [300]:
# Final check: non-null counts and dtypes for every column after cleaning.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            89 non-null     object 
 1   type            89 non-null     object 
 2   length_km       89 non-null     float64
 3   duration_hour   89 non-null     float64
 4   difficulty      89 non-null     object 
 5   region          89 non-null     object 
 6   type_of_view    89 non-null     object 
 7   rating          89 non-null     Float64
 8   elevation_gain  89 non-null     Float64
dtypes: Float64(2), float64(2), object(5)
memory usage: 6.6+ KB


In [301]:
# DataFrame.to_csv returns None when given a path, so bind the cleaned frame itself
# to `data_processed` rather than the call's return value.
data_processed = data.copy()
# index=False keeps the RangeIndex out of the file; otherwise re-reading it yields
# a spurious "Unnamed: 0" column that the source CSV did not have.
data_processed.to_csv('data_processed.csv', index=False)