In [1]:
# Regular EDA (exploratory data analysis) and plotting libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data manipulation tools (label encoding and feature scaling)
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

# Models and evaluations from scikit-learn
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

# Data splitting, cross-validation, hyperparameter search, and metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# on newer versions this import fails and RocCurveDisplay.from_estimator is the
# replacement -- confirm which scikit-learn version this notebook targets.
from sklearn.metrics import plot_roc_curve

# NOTE(review): the wildcard import hides which names come from my_ml_functions;
# consider importing the helpers explicitly once the used names are known.
from my_ml_functions import * # import custom functions from my_ml_functions.py file

# Problem:
The Sonar Dataset involves predicting whether an object is a mine or a rock, given the strength of sonar returns at different angles. The baseline performance of always predicting the most prevalent class is a classification accuracy of approximately 53%. Top results achieve a classification accuracy of approximately 88%.

# Data:
It is a binary (2-class) classification problem. The classes are not perfectly balanced (111 mines vs. 97 rocks), but the difference is small. There are 208 observations with 60 input variables, each in the range [0.0, 1.0], and 1 output variable ('R' for rock, 'M' for mine). Each input number represents the energy within a particular frequency band, integrated over a certain period of time.

## Read in data, add column names, and inspect data

In [16]:
# Column labels: 'test1' ... 'test60' for the 60 input variables, followed by 'target'
columns = [f'test{i}' for i in range(1, 61)]
columns.append('target')

# header=None makes pandas treat the first line of the file as data,
# so the labels built above are supplied through `names`
df = pd.read_csv('data/sonar.csv', names=columns, header=None)
df.head()

Unnamed: 0,test1,test2,test3,test4,test5,test6,test7,test8,test9,test10,...,test52,test53,test54,test55,test56,test57,test58,test59,test60,target
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R


In [12]:
# Count missing values per column; the saved output shows 0 for every displayed column.
# NOTE(review): the saved output lists 'obs#' labels, while the load cell names the
# columns 'test#' -- the output looks stale from an earlier run; re-run to refresh it.
df.isnull().sum()

obs1      0
obs2      0
obs3      0
obs4      0
obs5      0
         ..
obs57     0
obs58     0
obs59     0
obs60     0
target    0
Length: 61, dtype: int64

In [15]:
# Class counts for the label column: M (111) is slightly more frequent than R (97),
# but the gap is not large enough to be treated as a class imbalance
df['target'].value_counts()

M    111
R     97
Name: target, dtype: int64

## Explore the Data (EDA)