In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Read the planet data from planet.csv (path is relative to the working directory).
planets = pd.read_csv('planet.csv')

In [None]:
# Preview the first rows of the loaded frame.
planets.head()

Looking for correlated features

In [None]:
# Correlations are computed on the numeric columns only, so filter those out first.
numeric_planets = planets.select_dtypes(include=['number'])
correlations = numeric_planets.corr()

# Annotated heatmap of the correlation matrix, with the color scale centered on 0.
fig = plt.figure(figsize=(7, 7))
sns.heatmap(correlations, center=0, square=True, annot=True)

Looking at Orbit shape

In [None]:
# Histogram of the eccentricity column, labelled through the returned axes.
ax = planets.eccentricity.hist()
ax.set_xlabel('eccentricity')
ax.set_ylabel('frequency')
ax.set_title('Exoplanet Eccentricities')

In [None]:
# Smallest and largest eccentricity in the data, displayed as a (min, max) tuple.
planets.eccentricity.min(), planets.eccentricity.max()

Understanding the semi-major axis

In [None]:
from matplotlib.patches import Ellipse

fig, axes = plt.subplots(1, 1)

# Light-blue ellipse centered on the origin stands in for the orbit.
orbit = Ellipse(xy=(0, 0), width=2, height=1.5, facecolor='lightblue')
axes.add_artist(orbit)

# Segment from the left edge of the ellipse to its center: the semi-major axis.
axes.plot([-1, 0], [0, 0])

# (label, point the arrow targets, position of the label text)
callouts = [
    ('semi-major axis', (-0.5, 0), (-0.8, -0.2)),
    ('orbit center', (0, 0), (-0.21, 0.115)),
]
for label, target, text_position in callouts:
    axes.annotate(
        label, xy=target, xytext=text_position,
        arrowprops=dict(arrowstyle='wedge')
    )

# (legend label, x, y, marker size, color) for the two bodies in the sketch
bodies = [
    ('planet', -.75, 0.5, 4, 'green'),
    ('star', 0, 0, 10, 'orange'),
]
for label, x_pos, y_pos, size, color in bodies:
    axes.plot(
        [x_pos], [y_pos],
        marker='o', markersize=size,
        color=color, label=label
    )

axes.set_xlim(-1.25, 1.25)
axes.set_ylim(-1.25, 1.25)
axes.legend()

Checking data values

In [None]:
# Non-null counts and dtypes for the four columns of interest.
planets[['period', 'eccentricity', 'semimajoraxis', 'mass']].info()

In [None]:
# Shape that remains after dropping every row with a null in any of the four columns.
planets[['period', 'eccentricity', 'semimajoraxis', 'mass']].dropna().shape

In [None]:
# Summary statistics for the four columns of interest.
planets[['period', 'eccentricity', 'semimajoraxis', 'mass']].describe()

Visualizing Year and Orbit Length

In [None]:
# Period against semi-major axis, colored by the value in the 'list' column.
ax = sns.scatterplot(
    x=planets['semimajoraxis'],
    y=planets['period'],
    hue=planets['list'],
    alpha=0.5,
)
ax.set_title('Period vs. Semimajoraxis')
# move legend to the right of the plot
ax.legend(bbox_to_anchor=(1, 0.77))


In [None]:
fig, axes = plt.subplots(1, 1, figsize=(10, 10))

# Boolean Series flagging rows whose `list` value is 'Solar System', named
# 'in solar system?' and used as the hue.
in_solar_system = (planets.list == 'Solar System').rename('in solar system?')
ax = sns.scatterplot(
    x=planets.semimajoraxis,
    y=planets.period,
    hue=in_solar_system,
    ax=axes
)
ax.set_yscale('log')

# Label each solar-system planet. Iterating over plain row tuples hands
# matplotlib scalar coordinates. The earlier per-name query passed one-element
# Series instead, which relies on implicit Series-to-float conversion
# (deprecated in recent pandas) and cannot work if a name matches several rows.
solar_system = planets[planets.list == 'Solar System']
for planet, semimajoraxis, period in solar_system[
    ['name', 'semimajoraxis', 'period']
].itertuples(index=False, name=None):
    ax.annotate(
        planet,
        xy=(semimajoraxis, period),
        # place the label text 7 x-units to the right of the point it marks
        xytext=(7 + semimajoraxis, period),
        arrowprops=dict(arrowstyle='->')
    )
ax.set_title('log(orbital period) vs. semi-major axis')


Finding Similar Planets with k-Means Clustering

In [None]:
from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the features, then cluster them into 8 groups; random_state is
# pinned to 0 on the k-means step.
pipeline_steps = [
    ('scale', StandardScaler()),
    ('kmeans', KMeans(n_clusters=8, random_state=0)),
]
kmeans_pipeline = Pipeline(pipeline_steps)

In [None]:
# Cluster on two columns only; rows missing either value are dropped first.
kmeans_data = planets[['semimajoraxis', 'period']].dropna()
# Fit the pipeline (scaler, then k-means) on that frame.
kmeans_pipeline.fit(kmeans_data)

In [None]:
fig, axes = plt.subplots(1, 1, figsize=(7, 7))

# Color each point by the cluster the fitted pipeline assigns it to.
ax = sns.scatterplot(
    x=kmeans_data.semimajoraxis,
    y=kmeans_data.period,
    hue=kmeans_pipeline.predict(kmeans_data),
    ax=axes, palette='Accent'
)
ax.set_yscale('log')

# Label each solar-system planet. Iterating over plain row tuples hands
# matplotlib scalar coordinates. The earlier per-name query passed one-element
# Series instead, which relies on implicit Series-to-float conversion
# (deprecated in recent pandas) and cannot work if a name matches several rows.
solar_system = planets[planets.list == 'Solar System']
for planet, semimajoraxis, period in solar_system[
    ['name', 'semimajoraxis', 'period']
].itertuples(index=False, name=None):
    ax.annotate(
        planet,
        xy=(semimajoraxis, period),
        # place the label text 7 x-units to the right of the point it marks
        xytext=(7 + semimajoraxis, period),
        arrowprops=dict(arrowstyle='->')
    )
ax.get_legend().remove()
ax.set_title('KMeans Clusters')


In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

import numpy as np

def elbow_point(X, max_k=10):
    """Plot within-cluster sum of squares (WCSS) against k for k-means.

    Fits one ``KMeans(n_clusters=k, random_state=0)`` per candidate k and
    plots each model's ``inertia_`` so the "elbow" can be read off the curve.

    Parameters
    ----------
    X : array-like
        Data to cluster. It is passed to ``KMeans.fit`` unchanged; no scaling
        is applied here.
    max_k : int, default 10
        Largest number of clusters to try. Capped at ``len(X)`` because
        k-means cannot form more clusters than there are samples.

    Returns
    -------
    None
        The curve is drawn and shown with ``plt.show()``.
    """
    # One definition of the candidate k values, shared by the fitting loop and
    # the x-axis (previously range(1, 11) was written out twice).
    candidate_ks = range(1, min(max_k, len(X)) + 1)

    # inertia_ is the WCSS of the fitted model
    wcss = [
        KMeans(n_clusters=k, random_state=0).fit(X).inertia_
        for k in candidate_ks
    ]

    # Plot the elbow curve
    plt.plot(candidate_ks, wcss)
    plt.title('Elbow Method For Optimal k')
    plt.xlabel('Number of clusters (k)')
    plt.ylabel('WCSS')
    plt.show()

# Example usage
# NOTE(review): this passes the unscaled frame, whereas the pipelines in this
# notebook standardize before clustering. Confirm which feature space the
# elbow should be judged in.
elbow_point(kmeans_data)

In [None]:
# Same scale-then-cluster pipeline as before, but with 2 clusters, fitted
# immediately on kmeans_data.
kmeans_pipeline_2 = Pipeline(
    [
        ('scale', StandardScaler()), 
        ('kmeans', KMeans(2, random_state=0))
    ]
).fit(kmeans_data)

fig, axes = plt.subplots(1, 1, figsize=(7, 7))

# Color each point by the cluster the 2-cluster pipeline assigns it to.
ax = sns.scatterplot(
    x=kmeans_data.semimajoraxis,
    y=kmeans_data.period,
    hue=kmeans_pipeline_2.predict(kmeans_data),
    ax=axes, palette='Accent'
)

ax.set_yscale('log')

# Annotate planets in the solar system. Iterating over plain row tuples hands
# matplotlib scalar coordinates. The earlier per-name query passed one-element
# Series instead, which relies on implicit Series-to-float conversion
# (deprecated in recent pandas) and cannot work if a name matches several rows.
solar_system = planets[planets.list == 'Solar System']
for planet, semimajoraxis, period in solar_system[
    ['name', 'semimajoraxis', 'period']
].itertuples(index=False, name=None):
    ax.annotate(
        planet,
        xy=(semimajoraxis, period),
        # place the label text 7 x-units to the right of the point it marks
        xytext=(7 + semimajoraxis, period),
        arrowprops=dict(arrowstyle='->')
    )

ax.get_legend().remove()
ax.set_title('KMeans Clusters')

Visualizing the cluster space

In [None]:
# Centroids stored on the fitted k-means step of the 2-cluster pipeline.
kmeans_pipeline_2.named_steps['kmeans'].cluster_centers_

In [None]:
# set up layout: one large axes plus a smaller inset drawn over its lower right
fig = plt.figure(figsize=(8, 6))
outside = fig.add_axes([0.1, 0.1, 0.9, 0.9])
inside = fig.add_axes([0.6, 0.2, 0.35, 0.35])

# scaled data and cluster distance data
# kmeans_pipeline_2 is already fitted, so only transform here. fit_transform
# would re-fit the scaler and the k-means step as a side effect of a cell that
# is only meant to plot.
scaled = kmeans_pipeline_2.named_steps['scale'].transform(kmeans_data)
cluster_distances = kmeans_pipeline_2.transform(kmeans_data)

# cluster assignment of each training row, used to color both panels
cluster_labels = kmeans_pipeline_2.named_steps['kmeans'].labels_

for ax, points, title, axes_labels in zip(
    [outside, inside], [scaled, cluster_distances],
    ['Visualizing Clusters', 'Cluster Distance Space'],
    ['standardized', 'distance to centroid']
):
    ax = sns.scatterplot(
        x=points[:, 0],
        y=points[:, 1],
        ax=ax,
        palette='Accent',
        alpha=0.5,
        hue=cluster_labels,
        s=100
    )

    ax.get_legend().remove()
    ax.set_title(title)
    ax.set_xlabel(f'semimajoraxis ({axes_labels})')
    ax.set_ylabel(f'period ({axes_labels})')
    ax.set_ylim(-1, None)

# add the centroids to the outside plot
# NOTE(review): the color names below are hard-coded and presumably meant to
# match the scatter colors. Verify against the rendered palette.
cluster_centers = kmeans_pipeline_2.named_steps['kmeans']\
                    .cluster_centers_
for color, centroid in zip(['green', 'purple'], cluster_centers):
    outside.plot(*centroid, color=color, marker='x')
    outside.annotate(
        # label sits 5 y-units above the centroid it points at
        f'{color} center', xy=centroid, xytext=centroid + [0, 5],
        arrowprops=dict(arrowstyle='->')
    )


Evaluation of model
There are many metrics to choose from, but since we don't know the true labels of our data, we can only use unsupervised ones. We will use a few different metrics to get a more well-rounded view of our performance:

Silhouette Score
true labels not known
higher = better defined (more separated) clusters
-1 is worst, 1 is best, near 0 indicates overlapping clusters

In [None]:
from sklearn.metrics import silhouette_score
# Silhouette score of the labels that kmeans_pipeline predicts for kmeans_data.
# NOTE(review): the pipeline standardizes before clustering, but the score is
# computed on the unscaled frame. Confirm this is the intended feature space.
silhouette_score(kmeans_data, kmeans_pipeline.predict(kmeans_data)) 

Davies-Bouldin Score
true labels not known
ratio of within-cluster distances to between-cluster distances
zero is the best partition

In [None]:
from sklearn.metrics import davies_bouldin_score
# Davies-Bouldin score of the labels that kmeans_pipeline predicts for kmeans_data.
# NOTE(review): the pipeline standardizes before clustering, but the score is
# computed on the unscaled frame. Confirm this is the intended feature space.
davies_bouldin_score(kmeans_data, kmeans_pipeline.predict(kmeans_data)) 

Calinski-Harabasz Score
true labels not known
higher = better defined (more separated) clusters

In [None]:
from sklearn.metrics import calinski_harabasz_score

# Cluster assignments from the fitted pipeline, scored against the same frame
# they were predicted from.
predicted_labels = kmeans_pipeline.predict(kmeans_data)
score = calinski_harabasz_score(kmeans_data, predicted_labels)
print(score)


Predicting Length of Year in Earth Days (Period)
1. separate x and y data, dropping nulls
2. create the training and testing sets
3. train a linear regression model (no pipeline since we want to interpret the coefficients)
4. isolate the coefficients from the model
5. evaluate the model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# 1: keep only the rows where the target and all three features are present
data = planets[
    ['semimajoraxis', 'period', 'mass', 'eccentricity']
].dropna()
feature_columns = ['semimajoraxis', 'mass', 'eccentricity']
X, y = data[feature_columns], data['period']

# 2: hold out 25% of the rows for testing, with random_state pinned to 0
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.25
)

In [None]:
# Linear Regression: fit the model on the training split only.
lm = LinearRegression().fit(X_train, y_train)

In [None]:
# Get the fitted equation, starting with its constant term:
# the intercept of the fitted model
lm.intercept_

In [None]:
# 4. pair each training column with its fitted coefficient
list(zip(X_train.columns, lm.coef_))

In [None]:
# Evaluation of model:
# 5. predict on the held-out features, then read the off-diagonal entry of the
# correlation matrix: the correlation between actual and predicted values
preds = lm.predict(X_test)
np.corrcoef(y_test, preds)[0, 1]

In [None]:
fig, axes = plt.subplots(1, 1, figsize=(5, 3))

# Overlay actual (blue) and predicted (red) periods against the same x values.
series_to_plot = [
    (y_test, 'bo', 'actuals'),
    (preds, 'ro', 'predictions'),
]
for values, fmt, label in series_to_plot:
    axes.plot(X_test.semimajoraxis, values, fmt, label=label, alpha=0.5)

axes.set_xlabel('semimajoraxis')
axes.set_ylabel('period')
axes.legend()
fig.suptitle('Linear Regression')

In [None]:
# R-squared of the fitted model on the held-out test split:
lm.score(X_test, y_test)

In [None]:
from sklearn.metrics import r2_score
# R-squared again, this time computed from the test targets and the predictions.
r2_score(y_test, preds)

In [None]:
# Anscombe's Quartet
# All four data sets have the same summary statistics (mean, standard deviation, correlation coefficient), despite having different data:
# Load seaborn's 'anscombe' example data and group it by its 'dataset' column,
# so describe() reports one block of statistics per data set.
anscombe = sns.load_dataset('anscombe').groupby('dataset')
anscombe.describe()

In [None]:
# When fitted with a regression line, they all have the same R-squared despite some of them not indicating a linear relationship between x and y:
from sklearn.metrics import r2_score

fig, axes = plt.subplots(2, 2, figsize=(12, 12))
axes = axes.flatten()
titles = ['linear', 'non-linear', 'linear with outlier', 'vertical with outlier']

# One panel per group: scatter, fitted line, and a text box of statistics.
for ax, (group_name, group_data), title in zip(axes, anscombe, titles):
    x, y = group_data.x, group_data.y
    ax.scatter(x, y)
    ax.set_title(f'{group_name} - {title}')
    ax.set_xlabel('x')
    ax.set_ylabel('y')
    ax.set_xlim((3, 19.5))
    ax.set_ylim((2, 13))

    # get regression line: degree-1 polynomial fit gives slope m and intercept b.
    # 0 and 20 are prepended so the dashed line extends past the x-axis limits.
    m, b = np.polyfit(x, y, 1)
    reg_x = np.append([0, 20], x)
    reg_y = m * reg_x + b
    ax.plot(reg_x, reg_y, 'r--')

    # Every statistic uses the same `.2` precision. The mean of x previously
    # used `:2`, which is a minimum field width rather than a precision.
    summary = '\n'.join([
        f'ρ = {np.corrcoef(x, y)[0][1]:.2}',
        f'y = {m:.2}x + {b:.2}',
        '',
        f'$R^2$ = {r2_score(y, m * x + b):.2}',
        '',
        f'$μ_x$ = {np.mean(x):.2} | $σ_x$ = {np.std(x):.2}',
        f'$μ_y$ = {np.mean(y):.2} | $σ_y$ = {np.std(y):.2}',
    ])
    ax.annotate(summary, xy=(13, 2.5))
plt.suptitle("Anscombe's Quartet", fontsize=16, y=0.95)

Explained Variance

In [None]:
from sklearn.metrics import explained_variance_score
# Explained variance score of the predictions against the test targets.
explained_variance_score(y_test, preds)

Mean Absolute Error (MAE)

In [None]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the predictions against the test targets.
mean_absolute_error(y_test, preds)

Root Mean Squared Error (RMSE)

In [None]:
from sklearn.metrics import mean_squared_error
# RMSE: square root of the mean squared error on the test split.
np.sqrt(mean_squared_error(y_test, preds))

Median Absolute Error

In [None]:
from sklearn.metrics import median_absolute_error
# Median absolute error of the predictions against the test targets.
median_absolute_error(y_test, preds)