# Surf_Check_ML

In [50]:
# Initial imports
import pandas as pd
import hvplot.pandas
import plotly.express as px
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

In [51]:
# Load the beach_data.csv dataset.
file_path = "Resources/beaches_data.csv"
beaches_df = pd.read_csv(file_path)
beaches_df.head()

Unnamed: 0,Beach_ID,Beach_Name,Latitude,Longitude,Sunrise,Sunset,Weather_Description,Temperature,Water_Temperature,Cloud_Cover,Wind_Speed,Wave_Height
0,0,Imperial Beach,32.575831,-117.126577,07:07 AM,05:57 PM,Patchy rain possible,62,67,73,12,3.0
1,1,Border Field State Park,32.54494,-117.122341,07:07 AM,05:57 PM,Patchy rain possible,62,67,70,12,3.0
2,2,Pelican State Beach,41.992395,-124.209664,07:52 AM,06:09 PM,Light rain shower,48,52,100,4,10.5
3,3,Crescent Beach,41.728363,-124.153178,07:51 AM,06:10 PM,Light rain shower,46,51,100,6,7.9
4,4,Redwood National Park,41.20897,-124.119056,07:49 AM,06:11 PM,Patchy rain possible,47,52,86,7,9.5


In [52]:
# Renamed "Unnamed: 0" column and set it as the index
beaches_df = beaches_df.rename(columns={'Unnamed: 0': ''})
beaches_df = beaches_df.set_index('Beach_ID')
beaches_df.head()

Unnamed: 0_level_0,Beach_Name,Latitude,Longitude,Sunrise,Sunset,Weather_Description,Temperature,Water_Temperature,Cloud_Cover,Wind_Speed,Wave_Height
Beach_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,Imperial Beach,32.575831,-117.126577,07:07 AM,05:57 PM,Patchy rain possible,62,67,73,12,3.0
1,Border Field State Park,32.54494,-117.122341,07:07 AM,05:57 PM,Patchy rain possible,62,67,70,12,3.0
2,Pelican State Beach,41.992395,-124.209664,07:52 AM,06:09 PM,Light rain shower,48,52,100,4,10.5
3,Crescent Beach,41.728363,-124.153178,07:51 AM,06:10 PM,Light rain shower,46,51,100,6,7.9
4,Redwood National Park,41.20897,-124.119056,07:49 AM,06:11 PM,Patchy rain possible,47,52,86,7,9.5


In [53]:
print(beaches_df.shape)

(249, 11)


In [54]:
wave_height_df = beaches_df.filter(['Beach_Name', 'Wave_Height'], axis=1)
wave_height_df

Unnamed: 0_level_0,Beach_Name,Wave_Height
Beach_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Imperial Beach,3.0
1,Border Field State Park,3.0
2,Pelican State Beach,10.5
3,Crescent Beach,7.9
4,Redwood National Park,9.5
...,...,...
244,Pacific Beach,2.6
245,Mission Beach,2.6
246,Ocean Beach City Beach,2.6
247,Coronado Municipal Beach,3.0


In [55]:
wave_height_bins = [0, 3, 9, 12]
wave_size = ["Small (<4)", "Medium (4-9)", "Large (10-12)"]


# Categorize based on the bins.
beaches_df["Wave_Size"] = pd.cut(beaches_df["Wave_Height"], wave_height_bins, labels=wave_size)
beaches_df

Unnamed: 0_level_0,Beach_Name,Latitude,Longitude,Sunrise,Sunset,Weather_Description,Temperature,Water_Temperature,Cloud_Cover,Wind_Speed,Wave_Height,Wave_Size
Beach_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,Imperial Beach,32.575831,-117.126577,07:07 AM,05:57 PM,Patchy rain possible,62,67,73,12,3.0,Small (<4)
1,Border Field State Park,32.544940,-117.122341,07:07 AM,05:57 PM,Patchy rain possible,62,67,70,12,3.0,Small (<4)
2,Pelican State Beach,41.992395,-124.209664,07:52 AM,06:09 PM,Light rain shower,48,52,100,4,10.5,Large (10-12)
3,Crescent Beach,41.728363,-124.153178,07:51 AM,06:10 PM,Light rain shower,46,51,100,6,7.9,Medium (4-9)
4,Redwood National Park,41.208970,-124.119056,07:49 AM,06:11 PM,Patchy rain possible,47,52,86,7,9.5,Large (10-12)
...,...,...,...,...,...,...,...,...,...,...,...,...
244,Pacific Beach,32.793934,-117.256811,07:08 AM,05:57 PM,Patchy rain possible,62,67,76,11,2.6,Small (<4)
245,Mission Beach,32.773040,-117.251710,07:08 AM,05:57 PM,Patchy rain possible,62,67,76,11,2.6,Small (<4)
246,Ocean Beach City Beach,32.741633,-117.254458,07:08 AM,05:57 PM,Patchy rain possible,62,67,76,11,2.6,Small (<4)
247,Coronado Municipal Beach,32.689464,-117.186835,07:07 AM,05:57 PM,Patchy rain possible,62,67,76,12,3.0,Small (<4)


In [56]:
Wave_Size = pd.cut(beaches_df["Wave_Height"], wave_height_bins, labels=wave_size)

wave_height_df["Wave_Size"] = Wave_Size
wave_height_df

Unnamed: 0_level_0,Beach_Name,Wave_Height,Wave_Size
Beach_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Imperial Beach,3.0,Small (<4)
1,Border Field State Park,3.0,Small (<4)
2,Pelican State Beach,10.5,Large (10-12)
3,Crescent Beach,7.9,Medium (4-9)
4,Redwood National Park,9.5,Large (10-12)
...,...,...,...
244,Pacific Beach,2.6,Small (<4)
245,Mission Beach,2.6,Small (<4)
246,Ocean Beach City Beach,2.6,Small (<4)
247,Coronado Municipal Beach,3.0,Small (<4)


In [57]:
beaches_df.dtypes

Beach_Name               object
Latitude                float64
Longitude               float64
Sunrise                  object
Sunset                   object
Weather_Description      object
Temperature               int64
Water_Temperature         int64
Cloud_Cover               int64
Wind_Speed                int64
Wave_Height             float64
Wave_Size              category
dtype: object

In [58]:
for column in beaches_df.columns:
    print(f"Column {column} has {beaches_df[column].isnull().sum} null values")

Column Beach_Name has <bound method NDFrame._add_numeric_operations.<locals>.sum of Beach_ID
0      False
1      False
2      False
3      False
4      False
       ...  
244    False
245    False
246    False
247    False
248    False
Name: Beach_Name, Length: 249, dtype: bool> null values
Column Latitude has <bound method NDFrame._add_numeric_operations.<locals>.sum of Beach_ID
0      False
1      False
2      False
3      False
4      False
       ...  
244    False
245    False
246    False
247    False
248    False
Name: Latitude, Length: 249, dtype: bool> null values
Column Longitude has <bound method NDFrame._add_numeric_operations.<locals>.sum of Beach_ID
0      False
1      False
2      False
3      False
4      False
       ...  
244    False
245    False
246    False
247    False
248    False
Name: Longitude, Length: 249, dtype: bool> null values
Column Sunrise has <bound method NDFrame._add_numeric_operations.<locals>.sum of Beach_ID
0      False
1      False
2      False
3

In [59]:
#Find duplicates
print(f"Duplicate entries: {beaches_df.duplicated().sum()}")

Duplicate entries: 0


In [46]:
#Data Preprocessings; Transform String column (18.2.5)
#def change_string(Wave_Size):
#    if size == "Small":
#        return 0
#    elif size == "Medium":
#        return 1
#    else:
#        return 2
#beaches_df["Wave_Size"] = beaches_df["Wave_Size"].apply(change_string)
#beaches_df.head()

In [47]:
#Clustering using K-Means
#Initializing model with K = 3 (Wave sizes = small, medium, large)
model = KMeans(n_clusters=3, random_state=0)
model

KMeans(n_clusters=3, random_state=0)

In [24]:
#Fitting model 
#model.fit(beaches_df)

In [26]:
# Predict clusters
#predictions = model.predict(beaches_df)
#print(predictions)

In [28]:
# Get the predictions
#predictions = model.predict(beaches_df)
#print(predictions)

In [None]:
# Add a new class column to the beaches_df (18.3.2)
#beaches_df["Height_Predictction"] = model.labels_
#beaches_df.head()

In [None]:
#?? Create a scatterplot of beaches_df
#beaches_df.hvplot.scatter(x="sepal_length", y="sepal_width", by="class")

In [None]:
#Elbow Curve Method (18.4.1)

In [None]:
#Principal Component Analysis

In [55]:
# Use get_dummies() to create variables for text features.
#X = pd.get_dummies(data=crypto_df, columns=["Algorithm", "ProofType"])
#X.head(10)

Unnamed: 0,TotalCoinsMined,TotalCoinSupply,Algorithm_1GB AES Pattern Search,Algorithm_536,Algorithm_Argon2d,Algorithm_BLAKE256,Algorithm_Blake,Algorithm_Blake2S,Algorithm_Blake2b,Algorithm_C11,...,ProofType_PoW/PoS,ProofType_PoW/PoS.1,ProofType_PoW/PoW,ProofType_PoW/nPoS,ProofType_Pos,ProofType_Proof of Authority,ProofType_Proof of Trust,ProofType_TPoS,ProofType_Zero-Knowledge Proof,ProofType_dPoW/PoW
,,,,,,,,,,,,,,,,,,,,,
42,41.99995,42.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
404,1055185000.0,532000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1337,29279420000.0,314159265359.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BTC,17927180.0,21000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ETH,107684200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LTC,63039240.0,84000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DASH,9031294.0,22000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
XMR,17201140.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ETC,113359700.0,210000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
# Standardize the data with StandardScaler().
#X = StandardScaler().fit_transform(X)
#print(X)

[[-0.11710817 -0.1528703  -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.09396955 -0.145009   -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [ 0.52494561  4.48942416 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 ...
 [-0.09561336 -0.13217937 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.11694817 -0.15255998 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]
 [-0.11710536 -0.15285552 -0.0433963  ... -0.0433963  -0.0433963
  -0.0433963 ]]


### Deliverable 2: Reducing Data Dimensions Using PCA

In [33]:
# Using PCA to reduce dimension to three principal components.
#n_comp = 3
#pca = PCA(n_components=n_comp)
#ppca = pca.fit_transform(X)
#ppca

NameError: name 'X' is not defined

In [58]:
# Create a DataFrame with the three principal components.
#pcs_df = pd.DataFrame(ppca, columns=["PC 1", "PC 2","PC 3"], index = crypto_df.index)
#pcs_df.head(10)

Unnamed: 0,PC 1,PC 2,PC 3
,,,
42,-0.341149,1.119324,-0.253848
404,-0.324467,1.119538,-0.254071
1337,2.287932,1.535358,-0.346538
BTC,-0.160923,-1.381975,0.024595
ETH,-0.160844,-2.096176,0.131572
LTC,-0.166991,-1.046679,-0.008028
DASH,-0.387854,1.227333,-0.222178
XMR,-0.159305,-2.315098,0.110157
ETC,-0.159284,-2.096276,0.131563


### Deliverable 3: Clustering Crytocurrencies Using K-Means

#### Finding the Best Value for `k` Using the Elbow Curve

In [35]:
# Create an elbow curve to find the best value for K.
inertia = []

k = list(range(1, 11))

for i in k:
   km = KMeans(n_clusters =i, random_state=0)
   km.fit(beaches_df)
   inertia.append(km.inertia_)

elbow_data = {"k": k, "inertia": inertia}
elbow_df = pd.DataFrame(elbow_data)
elbow_df.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

ValueError: could not convert string to float: 'Imperial Beach'

Running K-Means with `k=4`

In [36]:
# Initialize the K-Means model.
model = KMeans(n_clusters=3, random_state=0)

# Fit the model
model.fit(pcs_df)

# Predict clusters
predictions = model.predict(beaches_df)
predictions

NameError: name 'pcs_df' is not defined

In [61]:
# Create a new DataFrame including predicted clusters and cryptocurrencies features.
# Concatentate the crypto_df and pcs_df DataFrames on the same columns.
clustered_df = pd.concat([crypto_df,pcs_df],axis =1)

#  Add a new column, "CoinName" to the clustered_df DataFrame that holds the names of the cryptocurrencies. 
clustered_df['CoinName'] = names_df['CoinName']

#  Add a new column, "Class" to the clustered_df DataFrame that holds the predictions.
clustered_df["Class"]= model.labels_

# Print the shape of the clustered_df
print(clustered_df.shape)
clustered_df.head(10)

(532, 9)


Unnamed: 0,Algorithm,ProofType,TotalCoinsMined,TotalCoinSupply,PC 1,PC 2,PC 3,CoinName,Class
,,,,,,,,,
42,Scrypt,PoW/PoS,41.99995,42.0,-0.341149,1.119324,-0.253848,42 Coin,0.0
404,Scrypt,PoW/PoS,1055185000.0,532000000.0,-0.324467,1.119538,-0.254071,404Coin,0.0
1337,X13,PoW/PoS,29279420000.0,314159265359.0,2.287932,1.535358,-0.346538,EliteCoin,0.0
BTC,SHA-256,PoW,17927180.0,21000000.0,-0.160923,-1.381975,0.024595,Bitcoin,3.0
ETH,Ethash,PoW,107684200.0,0.0,-0.160844,-2.096176,0.131572,Ethereum,3.0
LTC,Scrypt,PoW,63039240.0,84000000.0,-0.166991,-1.046679,-0.008028,Litecoin,3.0
DASH,X11,PoW/PoS,9031294.0,22000000.0,-0.387854,1.227333,-0.222178,Dash,0.0
XMR,CryptoNight-V7,PoW,17201140.0,0.0,-0.159305,-2.315098,0.110157,Monero,3.0
ETC,Ethash,PoW,113359700.0,210000000.0,-0.159284,-2.096276,0.131563,Ethereum Classic,3.0


### Deliverable 4: Visualizing Cryptocurrencies Results

#### 3D-Scatter with Clusters

In [62]:
# Creating a 3D-Scatter with the PCA data and the clusters
fig = px.scatter_3d(
    clustered_df,
    x="PC 1",
    y="PC 2",
    z="PC 3",
    color="Class",
    symbol="Class",
    width=800,
    hover_name="CoinName",
    hover_data=["Algorithm"],
)
fig.update_layout(legend=dict(x=0, y=1))
fig.show()


In [63]:
# Create a table with tradable cryptocurrencies.
clustered_df.hvplot.table(sortable=True, selectable=True)

In [64]:
# Print the total number of tradable cryptocurrencies.
print(f'There are {len(clustered_df)} tradable cryptocurrencies.')

There are 532 tradable cryptocurrencies.


In [65]:
# Scaling data to create the scatter plot with tradable cryptocurrencies.
mms = MinMaxScaler().fit_transform(clustered_df[["TotalCoinSupply","TotalCoinsMined"]])
mms

array([[4.20000000e-11, 0.00000000e+00],
       [5.32000000e-04, 1.06585544e-03],
       [3.14159265e-01, 2.95755135e-02],
       ...,
       [1.40022261e-03, 9.90135079e-04],
       [2.10000000e-05, 7.37028150e-06],
       [1.00000000e-06, 1.29582282e-07]])

In [66]:
# Create a new DataFrame that has the scaled data with the clustered_df DataFrame index.
plot_df=pd.DataFrame(
data=mms, columns=['TotalCoinSupply','TotalCoinsMined'], index = clustered_df.index)

# Add the "CoinName" column from the clustered_df DataFrame to the new DataFrame.
plot_df['CoinName'] = clustered_df['CoinName'] 

# Add the "Class" column from the clustered_df DataFrame to the new DataFrame. 
plot_df['Class'] = clustered_df['Class']
plot_df.head(10)

Unnamed: 0,TotalCoinSupply,TotalCoinsMined,CoinName,Class
,,,,
42,4.2e-11,0.0,42 Coin,0.0
404,0.000532,0.001066,404Coin,0.0
1337,0.3141593,0.029576,EliteCoin,0.0
BTC,2.1e-05,1.8e-05,Bitcoin,3.0
ETH,0.0,0.000109,Ethereum,3.0
LTC,8.4e-05,6.4e-05,Litecoin,3.0
DASH,2.2e-05,9e-06,Dash,0.0
XMR,0.0,1.7e-05,Monero,3.0
ETC,0.00021,0.000115,Ethereum Classic,3.0


In [67]:
# Create a hvplot.scatter plot using x="TotalCoinsMined" and y="TotalCoinSupply".
plot_df.hvplot.scatter(x="TotalCoinsMined", y="TotalCoinSupply", by="Class")
