In [7]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
import scipy as sp
from scipy import stats
from sklearn.preprocessing import LabelEncoder

In [2]:
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data' \
                ,header=None)
cols = ['sex','length','diameter','height','whole_weight','shucked_weight','viscera_weight','shell_weight','rings']
df.columns = cols
df.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [8]:
le = LabelEncoder()

df.sex = le.fit_transform(df.sex)
df.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,2,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,2,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,0,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,2,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,1,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [9]:
df.isnull().sum()

sex               0
length            0
diameter          0
height            0
whole_weight      0
shucked_weight    0
viscera_weight    0
shell_weight      0
rings             0
dtype: int64

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sex             4177 non-null   int32  
 1   length          4177 non-null   float64
 2   diameter        4177 non-null   float64
 3   height          4177 non-null   float64
 4   whole_weight    4177 non-null   float64
 5   shucked_weight  4177 non-null   float64
 6   viscera_weight  4177 non-null   float64
 7   shell_weight    4177 non-null   float64
 8   rings           4177 non-null   int64  
dtypes: float64(7), int32(1), int64(1)
memory usage: 277.5 KB


In [11]:
for i in df:
    print(f'skewness of {i} is {stats.skew(df[i])} and its kurtosis value is {stats.kurtosis(df[i])}')

skewness of sex is -0.09811989796728826 and its kurtosis value is -1.5140105390978127
skewness of length is -0.6396434615451078 and its kurtosis value is 0.06310755265972467
skewness of diameter is -0.6089793517180232 and its kurtosis value is -0.04685725781261718
skewness of height is 3.127693679207538 and its kurtosis value is 75.93309871329375
skewness of whole_weight is 0.5307678720133928 and its kurtosis value is -0.025051305562984627
skewness of shucked_weight is 0.7188396611678955 and its kurtosis value is 0.5929754415645077
skewness of viscera_weight is 0.5916395905344537 and its kurtosis value is 0.08247512417945213
skewness of shell_weight is 0.6207038222275745 and its kurtosis value is 0.5298535134982196
skewness of rings is 1.1137017739656028 and its kurtosis value is 2.3264623620128333


In [53]:
# hence from the above it can be inferred that height and rings are not normally distributed among the data

In [12]:
df.corr()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
sex,1.0,-0.036066,-0.038874,-0.042077,-0.021391,-0.001373,-0.032067,-0.034854,-0.034627
length,-0.036066,1.0,0.986812,0.827554,0.925261,0.897914,0.903018,0.897706,0.55672
diameter,-0.038874,0.986812,1.0,0.833684,0.925452,0.893162,0.899724,0.90533,0.57466
height,-0.042077,0.827554,0.833684,1.0,0.819221,0.774972,0.798319,0.817338,0.557467
whole_weight,-0.021391,0.925261,0.925452,0.819221,1.0,0.969405,0.966375,0.955355,0.54039
shucked_weight,-0.001373,0.897914,0.893162,0.774972,0.969405,1.0,0.931961,0.882617,0.420884
viscera_weight,-0.032067,0.903018,0.899724,0.798319,0.966375,0.931961,1.0,0.907656,0.503819
shell_weight,-0.034854,0.897706,0.90533,0.817338,0.955355,0.882617,0.907656,1.0,0.627574
rings,-0.034627,0.55672,0.57466,0.557467,0.54039,0.420884,0.503819,0.627574,1.0


In [13]:
# lets predict the sex of abalone from the above dataset

X = df.iloc[:,1:]
y = df.sex


In [118]:
df.describe()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
count,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0,4177.0
mean,1.95547,0.523992,0.407881,0.139516,0.828742,0.359367,0.180594,0.238831,9.933684
std,0.827815,0.120093,0.09924,0.041827,0.490389,0.221963,0.109614,0.139203,3.224169
min,1.0,0.075,0.055,0.0,0.002,0.001,0.0005,0.0015,1.0
25%,1.0,0.45,0.35,0.115,0.4415,0.186,0.0935,0.13,8.0
50%,2.0,0.545,0.425,0.14,0.7995,0.336,0.171,0.234,9.0
75%,3.0,0.615,0.48,0.165,1.153,0.502,0.253,0.329,11.0
max,3.0,0.815,0.65,1.13,2.8255,1.488,0.76,1.005,29.0


In [122]:
df[ df.height == df[df.sex == 2].height.max()] # filtering data with df

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
2051,2,0.455,0.355,1.13,0.594,0.332,0.116,0.1335,8


In [129]:
df[df.length == df[df.sex == 1].length.max()]

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
1209,2,0.78,0.63,0.215,2.657,1.488,0.4985,0.586,11
3715,1,0.78,0.6,0.21,2.548,1.1945,0.5745,0.6745,11


In [172]:
grouped_data = df[df.rings > np.mean(df.rings)].groupby('sex')

In [178]:
grouped_data.mean()

Unnamed: 0_level_0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0.592434,0.465657,0.162808,1.137232,0.484805,0.246481,0.32999,12.271293
2,0.594785,0.469026,0.163867,1.13172,0.470961,0.248647,0.332663,12.462061
3,0.530324,0.411822,0.141377,0.750818,0.315848,0.160451,0.227844,11.870445


In [179]:
grouped_data.median()

Unnamed: 0_level_0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0.6,0.475,0.165,1.1135,0.4655,0.2375,0.322,11.0
2,0.61,0.475,0.165,1.115,0.4655,0.245,0.3225,11.0
3,0.54,0.42,0.14,0.751,0.3065,0.1575,0.225,11.0


In [192]:
grouped_data.corrwith(df.shell_weight)

Unnamed: 0_level_0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,0.842469,0.855111,0.771201,0.906488,0.793488,0.80174,1.0,0.26136
2,0.854474,0.863293,0.795518,0.91121,0.779156,0.809528,1.0,0.243459
3,0.889911,0.883942,0.815046,0.942282,0.857158,0.874773,1.0,0.230607


In [14]:
X.shape

(4177, 8)

In [18]:
X.head()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [15]:
y.shape

(4177,)

In [16]:
y.unique()

array([2, 0, 1])

In [20]:
# first lets try to use K-means clustering to find out if we can cluster the dataset into the following

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=25)

model = KMeans(n_clusters=3, random_state=45)
model.fit(X_train)

y_predict_test = model.predict(X_test)
y_predict_train = model.predict(X_train)

print('test: ', accuracy_score(y_predict_test, y_test))
print('train: ',accuracy_score(y_predict_train, y_train))

test:  0.31770334928229665
train:  0.29278416347381864


In [49]:
# absolutely not the alogrithm to go with ..... lets try to use MLP Neural network for classification and see how it performs

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import scale

X_scale = scale(X)
X_scale = pd.DataFrame(X_scale)

X_train, X_test, y_train, y_test = train_test_split(X_scale, y, test_size=0.2, random_state=154)

model = MLPClassifier(
    hidden_layer_sizes=(40,40,30),
    max_iter=300,
    learning_rate='adaptive',
    learning_rate_init=0.025
)

model.fit(X_train, y_train)

y_predict_test = model.predict(X_test)
y_predict_train = model.predict(X_train)

print('train: ', accuracy_score(y_train, y_predict_train))
print('test: ', accuracy_score(y_test, y_predict_test))

train:  0.6324453756360371
test:  0.534688995215311


In [50]:
# we are getting much better accuracy with MLP as compared to with K-means but still this model is not worth it
# lets try and see if we can tune some hyperparameters into giving use better results

model = MLPClassifier(
    hidden_layer_sizes=(30,30,25),
    max_iter=700,
    activation='relu',
    learning_rate='adaptive',
    solver='lbfgs',
    learning_rate_init=0.02,
    random_state=320
)

model.fit(X_train, y_train)

y_predict_test = model.predict(X_test)
y_predict_train = model.predict(X_train)

print('train: ', accuracy_score(y_train, y_predict_train))
print('test: ', accuracy_score(y_test, y_predict_test))

train:  0.7330140676444178
test:  0.5454545454545454


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [51]:
# lets try and see if we can produce a decision tree to have better results in this terms

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

# Here we are using DecisionTreeClassifier as we need classification
# for Prediction we can use DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()

model.fit(X_train, y_train)

y_predict_test = model.predict(X_test)
y_predict_train = model.predict(X_train)

print('train: ', accuracy_score(y_train, y_predict_train))
print('test: ', accuracy_score(y_test, y_predict_test))

train:  1.0
test:  0.4964114832535885


In [54]:
# the decision tree model completely overfit the data so it needs optimization lets tune some hyperparameters

model = DecisionTreeClassifier(max_depth=6, criterion='entropy')

model.fit(X_train, y_train)

y_predict_test = model.predict(X_test)
y_predict_train = model.predict(X_train)

print('train: ', accuracy_score(y_train, y_predict_train))
print('test: ', accuracy_score(y_test, y_predict_test))

train:  0.6058066447171505
test:  0.5801435406698564


In [55]:
# before further optimization lets use random forest

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=100,
    random_state=19,
    min_samples_leaf=5
)

model.fit(X_train, y_train)

y_predict_test = model.predict(X_test)
y_predict_train = model.predict(X_train)

print('train: ', accuracy_score(y_train, y_predict_train))
print('test: ', accuracy_score(y_test, y_predict_test))

train:  0.8656090990721341
test:  0.5729665071770335
