# Rock vs Mine Prediction
- YouTube Tutorial: https://www.youtube.com/watch?v=fiz1ORTBGpY&list=PLfFghEzKVmjvuSA67LszN1dZ-Dd_pkus6&ab_channel=Siddhardhan

In [76]:
import numpy as np
import pandas as pd

In [77]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Data Collection and Data Processing

In [78]:
# Load the dataset into a pandas DataFrame.
# NOTE: with read_csv's default header handling, the first CSV row is consumed as
# column names - that is why the output below shows data values in the header row.
sonar_data = pd.read_csv('sonar_data.csv')
sonar_data

Unnamed: 0,0.0200,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.0180,0.0084,0.0090,0.0032,R
0,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.0140,0.0049,0.0052,0.0044,R
1,0.0262,0.0582,0.1099,0.1083,0.0974,0.2280,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.0180,0.0244,0.0316,0.0164,0.0095,0.0078,R
2,0.0100,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.0150,0.0085,0.0073,0.0050,0.0044,0.0040,0.0117,R
3,0.0762,0.0666,0.0481,0.0394,0.0590,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.0110,0.0015,0.0072,0.0048,0.0107,0.0094,R
4,0.0286,0.0453,0.0277,0.0174,0.0384,0.0990,0.1201,0.1833,0.2105,0.3039,...,0.0045,0.0014,0.0038,0.0013,0.0089,0.0057,0.0027,0.0051,0.0062,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202,0.0187,0.0346,0.0168,0.0177,0.0393,0.1630,0.2028,0.1694,0.2328,0.2684,...,0.0116,0.0098,0.0199,0.0033,0.0101,0.0065,0.0115,0.0193,0.0157,M
203,0.0323,0.0101,0.0298,0.0564,0.0760,0.0958,0.0990,0.1018,0.1030,0.2154,...,0.0061,0.0093,0.0135,0.0063,0.0063,0.0034,0.0032,0.0062,0.0067,M
204,0.0522,0.0437,0.0180,0.0292,0.0351,0.1171,0.1257,0.1178,0.1258,0.2529,...,0.0160,0.0029,0.0051,0.0062,0.0089,0.0140,0.0138,0.0077,0.0031,M
205,0.0303,0.0353,0.0490,0.0608,0.0167,0.1354,0.1465,0.1123,0.1945,0.2354,...,0.0086,0.0046,0.0126,0.0036,0.0035,0.0034,0.0079,0.0036,0.0048,M


### Problem: No header / column header values

### Solution: Set `header = None`

In [79]:
# The file has no header row, so the first sample was being used as column names.
# Either supply the column names yourself with `names=['some', 'header', 'values']`
# or pass `header=None`, in which case pandas numbers the columns (0..60 here).
sonar_data = pd.read_csv('sonar_data.csv', header=None)
sonar_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.0200,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.0180,0.0084,0.0090,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.0140,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.2280,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.0180,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.0100,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.0150,0.0085,0.0073,0.0050,0.0044,0.0040,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.0590,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.0110,0.0015,0.0072,0.0048,0.0107,0.0094,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,0.0187,0.0346,0.0168,0.0177,0.0393,0.1630,0.2028,0.1694,0.2328,0.2684,...,0.0116,0.0098,0.0199,0.0033,0.0101,0.0065,0.0115,0.0193,0.0157,M
204,0.0323,0.0101,0.0298,0.0564,0.0760,0.0958,0.0990,0.1018,0.1030,0.2154,...,0.0061,0.0093,0.0135,0.0063,0.0063,0.0034,0.0032,0.0062,0.0067,M
205,0.0522,0.0437,0.0180,0.0292,0.0351,0.1171,0.1257,0.1178,0.1258,0.2529,...,0.0160,0.0029,0.0051,0.0062,0.0089,0.0140,0.0138,0.0077,0.0031,M
206,0.0303,0.0353,0.0490,0.0608,0.0167,0.1354,0.1465,0.1123,0.1945,0.2354,...,0.0086,0.0046,0.0126,0.0036,0.0035,0.0034,0.0079,0.0036,0.0048,M


### See shape of dataframe (i.e. how many rows and columns)

In [80]:
# shape is (rows, columns): 208 rows and 61 columns (columns 0-59 plus the label in column 60)
sonar_data.shape

(208, 61)

### See statistical measures of our data - mean, standard deviation, etc.

In [81]:
sonar_data.describe()
# The summary covers the numeric columns 0-59 (the label column 60 is not included).
# Let's focus on the first column to understand what each row of the summary means:
# count: the number of non-null (non-missing) values - i.e. column 0 has 208 non-null values
# mean: the mean (average) value; for column 0, the mean is 0.029164
# std: the standard deviation, which measures how spread out the values are
# min: the minimum value
# 25%: the 25th percentile (the first quartile) - 25% of the data falls below this value
# 50%: the median or 50th percentile (the second quartile) - half of the data falls below this value
# 75%: the 75th percentile (the third quartile) - 75% of the data falls below this value
# max: the maximum value

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
count,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,...,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0
mean,0.029164,0.038437,0.043832,0.053892,0.075202,0.10457,0.121747,0.134799,0.178003,0.208259,...,0.016069,0.01342,0.010709,0.010941,0.00929,0.008222,0.00782,0.007949,0.007941,0.006507
std,0.022991,0.03296,0.038428,0.046528,0.055552,0.059105,0.061788,0.085152,0.118387,0.134416,...,0.012008,0.009634,0.00706,0.007301,0.007088,0.005736,0.005785,0.00647,0.006181,0.005031
min,0.0015,0.0006,0.0015,0.0058,0.0067,0.0102,0.0033,0.0055,0.0075,0.0113,...,0.0,0.0008,0.0005,0.001,0.0006,0.0004,0.0003,0.0003,0.0001,0.0006
25%,0.01335,0.01645,0.01895,0.024375,0.03805,0.067025,0.0809,0.080425,0.097025,0.111275,...,0.008425,0.007275,0.005075,0.005375,0.00415,0.0044,0.0037,0.0036,0.003675,0.0031
50%,0.0228,0.0308,0.0343,0.04405,0.0625,0.09215,0.10695,0.1121,0.15225,0.1824,...,0.0139,0.0114,0.00955,0.0093,0.0075,0.00685,0.00595,0.0058,0.0064,0.0053
75%,0.03555,0.04795,0.05795,0.0645,0.100275,0.134125,0.154,0.1696,0.233425,0.2687,...,0.020825,0.016725,0.0149,0.0145,0.0121,0.010575,0.010425,0.01035,0.010325,0.008525
max,0.1371,0.2339,0.3059,0.4264,0.401,0.3823,0.3729,0.459,0.6828,0.7106,...,0.1004,0.0709,0.039,0.0352,0.0447,0.0394,0.0355,0.044,0.0364,0.0439


### See how many samples of each class (Rock and Mine) we have
- There are 111 Mines and 97 Rocks.
- Implications:
  - The two classes are roughly balanced, so the model's predictions shouldn't be heavily biased toward one class.
  - However, the number of samples is quite small; ideally we would want a lot more data.

In [82]:
# Column 60 holds the label/class, so we index by 60; with a properly named column you
# would write sonar_data['class'].value_counts() instead.
# value_counts() works the same way on other columns, e.g. a True/False column such as
# is_male: sonar_data['is_male'].value_counts()
sonar_data[60].value_counts()

60
M    111
R     97
Name: count, dtype: int64

### Group data based on mine or rock
- i.e. we group all the data points for Mine and for Rock, then look at the mean value of each column
- Implications
  - the mean value for Mine in column 1 is higher than the mean value for Rock in column 1
  - that may suggest that, whatever column 1 measures (say, hypothetically, the weight of the material), Mines in general score higher on it than Rocks (e.g. Mines would be heavier than Rocks).

In [83]:
# mean of every feature column, computed separately for each label (M and R) in column 60
sonar_data.groupby(60).mean()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
60,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
M,0.034989,0.045544,0.05072,0.064768,0.086715,0.111864,0.128359,0.149832,0.213492,0.251022,...,0.019352,0.016014,0.011643,0.012185,0.009923,0.008914,0.007825,0.00906,0.008695,0.00693
R,0.022498,0.030303,0.035951,0.041447,0.062028,0.096224,0.11418,0.117596,0.137392,0.159325,...,0.012311,0.010453,0.00964,0.009518,0.008567,0.00743,0.007814,0.006677,0.007078,0.006024


### Example Groupby Usecase

In [84]:
# Toy sales table used to illustrate groupby: one row per (product, day) sale.
data = dict(
    Product=['A', 'B', 'A', 'C', 'B', 'A', 'C', 'B'],
    Date=['2023-01-01', '2023-01-01', '2023-01-02', '2023-01-02', '2023-01-02', '2023-01-03', '2023-01-03', '2023-01-03'],
    SaleAmount=[10, 15, 10, 20, 25, 10, 20, 30],
)
df = pd.DataFrame(data)

# Collapse the rows that share a Product and add up the remaining columns.
# NOTE: sum() is applied to every non-key column, so the string Date column is
# concatenated rather than ignored (visible in the printed output); passing
# numeric_only=True to sum() would restrict the result to SaleAmount.
grouped = df.groupby(by='Product').sum()
print(grouped)

                                   Date  SaleAmount
Product                                            
A        2023-01-012023-01-022023-01-03          30
B        2023-01-012023-01-022023-01-03          70
C                  2023-01-022023-01-03          40


### Separating Features and Labels

In [85]:
# Features: every column except column 60, which holds the label.
# `columns=60` already says we are dropping a column (not a row), so no `axis`
# argument is needed; the equivalent older spelling is drop(60, axis=1).
X = sonar_data.drop(columns=60)
# Labels: column 60 on its own.
Y = sonar_data[60]

#### Features

In [86]:
# display the feature table (columns 0-59, label column removed)
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
0,0.0200,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0232,0.0027,0.0065,0.0159,0.0072,0.0167,0.0180,0.0084,0.0090,0.0032
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0125,0.0084,0.0089,0.0048,0.0094,0.0191,0.0140,0.0049,0.0052,0.0044
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.2280,0.2431,0.3771,0.5598,0.6194,...,0.0033,0.0232,0.0166,0.0095,0.0180,0.0244,0.0316,0.0164,0.0095,0.0078
3,0.0100,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0241,0.0121,0.0036,0.0150,0.0085,0.0073,0.0050,0.0044,0.0040,0.0117
4,0.0762,0.0666,0.0481,0.0394,0.0590,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0156,0.0031,0.0054,0.0105,0.0110,0.0015,0.0072,0.0048,0.0107,0.0094
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203,0.0187,0.0346,0.0168,0.0177,0.0393,0.1630,0.2028,0.1694,0.2328,0.2684,...,0.0203,0.0116,0.0098,0.0199,0.0033,0.0101,0.0065,0.0115,0.0193,0.0157
204,0.0323,0.0101,0.0298,0.0564,0.0760,0.0958,0.0990,0.1018,0.1030,0.2154,...,0.0051,0.0061,0.0093,0.0135,0.0063,0.0063,0.0034,0.0032,0.0062,0.0067
205,0.0522,0.0437,0.0180,0.0292,0.0351,0.1171,0.1257,0.1178,0.1258,0.2529,...,0.0155,0.0160,0.0029,0.0051,0.0062,0.0089,0.0140,0.0138,0.0077,0.0031
206,0.0303,0.0353,0.0490,0.0608,0.0167,0.1354,0.1465,0.1123,0.1945,0.2354,...,0.0042,0.0086,0.0046,0.0126,0.0036,0.0035,0.0034,0.0079,0.0036,0.0048


#### Label

In [87]:
# display the labels (one entry per row of X)
Y

0      R
1      R
2      R
3      R
4      R
      ..
203    M
204    M
205    M
206    M
207    M
Name: 60, Length: 208, dtype: object

### Separating Training and Test Data

In [88]:
# Split the data into a training set and a test set.
#
# X holds the input features; Y holds the label that corresponds to each row of X.
#
# test_size = 0.1: reserve 10% of the data for testing and use the remaining 90%
# for training (10-20% is a common choice for the test share).
#
# stratify=Y: keep the distribution of classes (labels) in the training and test
# splits the same as in the original Y. This is especially useful when the classes
# are significantly imbalanced.
#
# random_state=1: seed for the random number generator, for reproducibility - running
# this code again with the same seed and data gives the same train/test split.
#
# The output:
# X_train and Y_train are the features and labels used for training the model.
# X_test and Y_test are the features and labels used for testing the model's performance.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1, stratify=Y, random_state=1)

In [89]:
# display the feature rows (X) that ended up in the training split
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
115,0.0414,0.0436,0.0447,0.0844,0.0419,0.1215,0.2002,0.1516,0.0818,0.1975,...,0.0222,0.0045,0.0136,0.0113,0.0053,0.0165,0.0141,0.0077,0.0246,0.0198
38,0.0123,0.0022,0.0196,0.0206,0.0180,0.0492,0.0033,0.0398,0.0791,0.0475,...,0.0149,0.0125,0.0134,0.0026,0.0038,0.0018,0.0113,0.0058,0.0047,0.0071
56,0.0152,0.0102,0.0113,0.0263,0.0097,0.0391,0.0857,0.0915,0.0949,0.1504,...,0.0048,0.0049,0.0041,0.0036,0.0013,0.0046,0.0037,0.0011,0.0034,0.0033
123,0.0270,0.0163,0.0341,0.0247,0.0822,0.1256,0.1323,0.1584,0.2017,0.2122,...,0.0197,0.0189,0.0204,0.0085,0.0043,0.0092,0.0138,0.0094,0.0105,0.0093
18,0.0270,0.0092,0.0145,0.0278,0.0412,0.0757,0.1026,0.1138,0.0794,0.1520,...,0.0045,0.0084,0.0010,0.0018,0.0068,0.0039,0.0120,0.0132,0.0070,0.0088
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140,0.0412,0.1135,0.0518,0.0232,0.0646,0.1124,0.1787,0.2407,0.2682,0.2058,...,0.0798,0.0376,0.0143,0.0272,0.0127,0.0166,0.0095,0.0225,0.0098,0.0085
5,0.0286,0.0453,0.0277,0.0174,0.0384,0.0990,0.1201,0.1833,0.2105,0.3039,...,0.0104,0.0045,0.0014,0.0038,0.0013,0.0089,0.0057,0.0027,0.0051,0.0062
154,0.0117,0.0069,0.0279,0.0583,0.0915,0.1267,0.1577,0.1927,0.2361,0.2169,...,0.0039,0.0053,0.0029,0.0020,0.0013,0.0029,0.0020,0.0062,0.0026,0.0052
131,0.1150,0.1163,0.0866,0.0358,0.0232,0.1267,0.2417,0.2661,0.4346,0.5378,...,0.0228,0.0099,0.0065,0.0085,0.0166,0.0110,0.0190,0.0141,0.0068,0.0086


In [90]:
# display the labels (M or R) that ended up in the training split
Y_train

115    M
38     R
56     R
123    M
18     R
      ..
140    M
5      R
154    M
131    M
203    M
Name: 60, Length: 187, dtype: object

#### Identifying how many instances/rows of data are in our training & testing datasets

In [91]:
# shapes of the full, training and test feature sets:
# the 208 rows are split into 187 training rows and 21 test rows, all with 60 feature columns
print(X.shape, X_train.shape, X_test.shape)

(208, 60) (187, 60) (21, 60)


### Logistic Regression Model Training

In [92]:
# logistic regression classifier, created with its default settings (no arguments passed)
model = LogisticRegression()

In [93]:
# train the model on the training features and their matching labels
model.fit(X_train, Y_train)

### Model Evaluation

In [94]:
# Evaluate on the held-out test set: predict a label for every row of test
# features, then compare the predictions with the true labels.
y_pred = model.predict(X_test)

# Overall accuracy. scikit-learn metrics take their arguments as (y_true, y_pred),
# so the true labels go first - the same order used for classification_report below.
data_accuracy_score = accuracy_score(Y_test, y_pred)
print(data_accuracy_score)

# Or use the classification report, which also gives per-class precision, recall,
# f1-score and support.
print(classification_report(Y_test, y_pred))

0.7619047619047619
              precision    recall  f1-score   support

           M       0.75      0.82      0.78        11
           R       0.78      0.70      0.74        10

    accuracy                           0.76        21
   macro avg       0.76      0.76      0.76        21
weighted avg       0.76      0.76      0.76        21



### Making A Predictive System

In [95]:
import random

def modify_tuple(input_data, *, max_fraction=0.01, ndigits=4):
    """Return a copy of *input_data* with every value randomly jittered.

    Each number is changed by an independent random relative amount drawn
    uniformly from ``[-max_fraction, +max_fraction]`` and then rounded.

    Args:
        input_data: Iterable of numbers to perturb. It is not modified.
        max_fraction: Largest relative change applied to a value; the
            default ``0.01`` means up to +/-1%.
        ndigits: Number of decimal places kept after the perturbation.

    Returns:
        A tuple with one perturbed value per input value, in the same order.
        An empty input yields an empty tuple.
    """
    # One random draw per element, in input order, so results under a fixed
    # random seed match the previous loop-and-append implementation.
    return tuple(
        round(num + num * random.uniform(-max_fraction, max_fraction), ndigits)
        for num in input_data
    )

# Take the first entry (a rock) of sonar_data.csv and run it through the modify_tuple
# function to randomize the values a bit.
input_data = (0.0200,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,0.1609,0.1582,0.2238,0.0645,0.0660,0.2273,0.3100,0.2999,0.5078,0.4797,0.5783,0.5071,0.4328,0.5550,0.6711,0.6415,0.7104,0.8080,0.6791,0.3857,0.1307,0.2604,0.5121,0.7547,0.8537,0.8507,0.6692,0.6097,0.4943,0.2744,0.0510,0.2834,0.2825,0.4256,0.2641,0.1386,0.1051,0.1343,0.0383,0.0324,0.0232,0.0027,0.0065,0.0159,0.0072,0.0167,0.0180,0.0084,0.0090,0.0032)
updated_tuple = modify_tuple(input_data)
print(updated_tuple)

(0.0199, 0.037, 0.0431, 0.0208, 0.0956, 0.098, 0.1549, 0.1613, 0.3091, 0.2124, 0.1613, 0.1585, 0.2234, 0.064, 0.0662, 0.2264, 0.3102, 0.2974, 0.5061, 0.4751, 0.5801, 0.5054, 0.4319, 0.5576, 0.6771, 0.6372, 0.7093, 0.8085, 0.6834, 0.3886, 0.13, 0.2621, 0.5162, 0.7504, 0.8601, 0.8566, 0.666, 0.6114, 0.4936, 0.2739, 0.051, 0.284, 0.2838, 0.423, 0.2621, 0.1393, 0.106, 0.1347, 0.038, 0.0327, 0.023, 0.0027, 0.0066, 0.016, 0.0073, 0.0169, 0.018, 0.0084, 0.009, 0.0032)


In [97]:
# Feature values for one sample to classify (the feature table has 60 columns, so 60 values are needed).
input_data = (0.02, 0.0368, 0.0424, 0.0209, 0.0954, 0.0982, 0.1541, 0.1601, 0.3104, 0.2115, 0.16, 0.1575, 0.2231, 0.0651, 0.0661, 0.2255, 0.3097, 0.2977, 0.5106, 0.4792, 0.5777, 0.5037, 0.4337, 0.5538, 0.6725, 0.6467, 0.7128, 0.8019, 0.6852, 0.3853, 0.1298, 0.2611, 0.5081, 0.7594, 0.8563, 0.8558, 0.6743, 0.6083, 0.4948, 0.2721, 0.0509, 0.2807, 0.2833, 0.424, 0.2655, 0.138, 0.1056, 0.1335, 0.0382, 0.0326, 0.0233, 0.0027, 0.0065, 0.016, 0.0072, 0.0166, 0.0182, 0.0085, 0.0089, 0.0032)

# Convert the tuple to a NumPy array and reshape it to a single row
# (1 sample x N features), then ask the model for its prediction.
input_data = np.asarray(input_data).reshape(1, -1)
prediction = model.predict(input_data)
print(prediction)

['R']


In [98]:
# Translate the predicted label into a readable message: 'R' means rock, anything else is reported as a mine.
print('The object is a rock.' if prediction[0] == 'R' else 'The object is a mine')

The object is a rock.
