In [11]:
### Run this cell before continuing.

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

# Simplify working with large datasets in Altair
alt.data_transformers.disable_max_rows()

# Output dataframes instead of arrays
set_config(transform_output="pandas")

# Function needed to visualize images
# code below sourced from: https://gist.github.com/daviddalpiaz/ae62ae5ccd0bada4b9acd6dbc9008706
def show_digit(arr784):
    plt.imshow(np.array(arr784)[1:].reshape(28, 28), cmap="gray")
    
np.random.seed(1137110237) #Randomly picked seed

In [12]:
titles = ["mean_integrated_profile", "standard_dev_integrated_profile", "excess_kurtosis_integrated_profile", "skewness_integrated_profile",
         "mean_DM-SNR_curve", "standard_dev_DM-SNR_curve", "excess_kurtosis_DM-SNR_curve", "skewness_DM-SNR_curve", "class"]

TEMP_DATA_NAME = pd.read_csv("https://raw.githubusercontent.com/fyip3/ds_project/main/data/HTRU_2.csv", names=titles, on_bad_lines="skip")

In [13]:
TEMP_DATA_NAME[TEMP_DATA_NAME["class"] == 1]

Unnamed: 0,mean_integrated_profile,standard_dev_integrated_profile,excess_kurtosis_integrated_profile,skewness_integrated_profile,mean_DM-SNR_curve,standard_dev_DM-SNR_curve,excess_kurtosis_DM-SNR_curve,skewness_DM-SNR_curve,class
19,99.367188,41.572202,1.547197,4.154106,27.555184,61.719016,2.208808,3.662680,1
42,120.554688,45.549905,0.282924,0.419909,1.358696,13.079034,13.312141,212.597029,1
61,27.765625,28.666042,5.770087,37.419009,73.112876,62.070220,1.268206,1.082920,1
92,23.625000,29.948654,5.688038,35.987172,146.568562,82.394624,-0.274902,-1.121848,1
93,94.585938,35.779823,1.187309,3.687469,6.071070,29.760400,5.318767,28.698048,1
...,...,...,...,...,...,...,...,...,...
17515,89.867188,47.482295,1.591325,2.505057,0.763378,12.393561,17.940745,333.902630,1
17529,27.039062,33.754722,4.779124,26.255357,129.863712,78.815440,-0.348492,-0.893791,1
17558,77.070312,39.000638,1.884421,6.372178,38.517559,65.741059,1.589513,1.285346,1
17642,28.375000,27.649311,6.377273,45.944048,141.860368,82.893017,-0.477222,-1.067880,1


In [14]:
TEMP_DATA_NAME_train, TEMP_DATA_NAME_test = train_test_split(
    TEMP_DATA_NAME,
    test_size=.25,
)

In [15]:
TEMP_DATA_NAME_train.head(50)

Unnamed: 0,mean_integrated_profile,standard_dev_integrated_profile,excess_kurtosis_integrated_profile,skewness_integrated_profile,mean_DM-SNR_curve,standard_dev_DM-SNR_curve,excess_kurtosis_DM-SNR_curve,skewness_DM-SNR_curve,class
3351,90.789062,35.814987,1.782977,6.570318,7.310201,33.496492,5.158216,27.061486,1
13363,94.414062,49.408933,0.554283,0.041587,19.096154,54.898157,2.690077,5.648402,0
8723,102.898438,48.011366,0.682278,0.994826,5.561873,27.745867,5.557417,32.201803,0
15822,171.085938,53.557654,-1.137735,0.79333,44.762542,56.75377,1.204525,1.154686,0
15397,136.265625,40.448363,-0.106334,0.586721,0.944816,11.205751,17.095858,339.891205,0
10615,114.265625,48.402687,0.256552,0.192939,3.342809,20.798915,7.334235,59.216131,0
5288,115.171875,49.570043,0.010619,-0.37622,0.714047,11.575122,17.542459,325.634031,0
8377,96.59375,49.572932,0.531951,0.407986,1.249164,14.416283,13.091413,183.472352,0
9101,92.992188,39.645794,0.282909,1.182402,7.035953,32.346957,4.97689,25.021536,0
2322,119.53125,45.942379,0.02122,0.15582,2.400502,14.012571,9.07624,114.025303,0
