# Baye's Theorem

## Tel Bayes Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import BernoulliNB


In [3]:
tel = pd.read_csv("./Datasets/tel_bayes.csv")
tel.head()

Unnamed: 0,TT_gt_100,Gender,Response
0,y,male,not bought
1,n,male,not bought
2,n,female,not bought
3,n,female,not bought
4,n,male,not bought


In [5]:
dum_tel = pd.get_dummies(tel, drop_first=True)
dum_tel.head()

Unnamed: 0,TT_gt_100_y,Gender_male,Response_not bought
0,True,True,True
1,False,True,True
2,False,False,True
3,False,False,True
4,False,True,True


In [7]:
nb = BernoulliNB(alpha=0, force_alpha=True)
X = dum_tel[['TT_gt_100_y','Gender_male']]
y = dum_tel['Response_not bought']
nb.fit(X,y)

In [15]:
# tst = np.array([['y','male']])
dum_tst = np.array([[True, True]])
probs = nb.predict_proba(dum_tst)
print("P[Buying] = ",probs[0][0])
print("P[Not Buying] = ",probs[0][1])
print(nb.predict(dum_tst))

P[Buying] =  0.5294117647058824
P[Not Buying] =  0.47058823529411753
[False]




In [13]:
tst = np.array([['n','female']])
dum_tst = np.array([[False, False]])
probs = nb.predict_proba(dum_tst)
print("P[Buying] = ",probs[0][0])
print("P[Not Buying] = ",probs[0][1])
print(nb.predict(dum_tst))

P[Buying] =  0.3103448275862067
P[Not Buying] =  0.6896551724137933
[ True]




In [17]:
tst = np.array([["n","female"],["n","male"],["y","male"],["y","female"]])
tst = pd.DataFrame(tst, columns= ['TT_gt_100','Gender'])
dum_tst = pd.get_dummies(tst, drop_first=True)
nb.predict_proba(dum_tst)

array([[0.31034483, 0.68965517],
       [0.06976744, 0.93023256],
       [0.52941176, 0.47058824],
       [0.87096774, 0.12903226]])

## Telecom Dataset

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [18]:
tel = pd.read_csv("./Datasets/Telecom.csv")
tel.head()

Unnamed: 0,Gender,TT_gt_100,Response
0,F,Y,N
1,M,N,N
2,M,N,N
3,F,Y,Y
4,F,N,N


In [19]:
X = tel.drop('Response', axis=1)
y = tel['Response']

In [20]:
dum_X = pd.get_dummies(X, drop_first=True)
dum_X

Unnamed: 0,Gender_M,TT_gt_100_Y
0,False,True
1,True,False
2,True,False
3,False,True
4,False,False
...,...,...
145,False,True
146,True,False
147,True,False
148,True,False


In [25]:
X_train, X_test, y_train, y_test = train_test_split(dum_X, y, test_size=0.3, random_state=24)
nb = BernoulliNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8666666666666667


## Breast Cancer Dataset

In [27]:
cancer = pd.read_csv("./Cases/Cancer/Cancer.csv", index_col=0)
cancer.head()

Unnamed: 0_level_0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Class
subjid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,40-49,premeno,15 to 19,0 to 2,yes,three,right,left_up,no,recurrence-events
2,50-59,ge40,15 to 19,0 to 2,no,one,right,central,no,no-recurrence-events
3,50-59,ge40,35 to 39,0 to 2,no,two,left,left_low,no,recurrence-events
4,40-49,premeno,35 to 39,0 to 2,yes,three,right,left_low,yes,no-recurrence-events
5,40-49,premeno,30 to 34,3 to 5,yes,two,left,right_up,no,recurrence-events


In [28]:
X = cancer.drop('Class', axis=1)
y = cancer['Class']
X_dum = pd.get_dummies(X, drop_first=True)
X_dum.head()

Unnamed: 0_level_0,age_30-39,age_40-49,age_50-59,age_60-69,age_70-79,menopause_lt40,menopause_premeno,tumor-size_10 to 14,tumor-size_15 to 19,tumor-size_20 to 24,tumor-size_25 to 29,tumor-size_30 to 34,tumor-size_35 to 39,tumor-size_40 to 44,tumor-size_45 to 49,tumor-size_5 to 9,tumor-size_50 to 54,inv-nodes_12 to 14,inv-nodes_15 to 17,inv-nodes_24 to 26,inv-nodes_3 to 5,inv-nodes_6 to 8,inv-nodes_9 to 11,node-caps_yes,deg-malig_three,deg-malig_two,breast_right,breast-quad_left_low,breast-quad_left_up,breast-quad_right_low,breast-quad_right_up,irradiat_yes
subjid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
1,False,True,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,True,False,True,False,False,False
2,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False
3,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False
4,False,True,False,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,True,True,False,True,True,False,False,False,True
5,False,True,False,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,True,False,True,False,False,False,False,True,False


In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_dum, y, test_size=0.3, random_state=24)
nb = BernoulliNB(alpha=0, force_alpha=True)
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.8023255813953488


  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(
  ret = a @ b


# Gaussian Naive Bayes

In [40]:
from sklearn.naive_bayes import GaussianNB

## Bankruptcy Dataset

In [41]:
brupt = pd.read_csv("./Cases/Company Bankruptcy/data.csv")
brupt.head()

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,Continuous interest rate (after tax),Operating Expense Rate,Research and development expense rate,Cash flow rate,Interest-bearing debt interest rate,Tax rate (A),Net Value Per Share (B),Net Value Per Share (A),Net Value Per Share (C),Persistent EPS in the Last Four Seasons,Cash Flow Per Share,Revenue Per Share (Yuan ¥),Operating Profit Per Share (Yuan ¥),Per Share Net profit before tax (Yuan ¥),Realized Sales Gross Profit Growth Rate,Operating Profit Growth Rate,After-tax Net Profit Growth Rate,Regular Net Profit Growth Rate,Continuous Net Profit Growth Rate,Total Asset Growth Rate,Net Value Growth Rate,Total Asset Return Growth Rate Ratio,Cash Reinvestment %,Current Ratio,Quick Ratio,Interest Expense Ratio,Total debt/Total net worth,Debt ratio %,Net worth/Assets,Long-term fund suitability ratio (A),...,Current Assets/Total Assets,Cash/Total Assets,Quick Assets/Current Liability,Cash/Current Liability,Current Liability to Assets,Operating Funds to Liability,Inventory/Working Capital,Inventory/Current Liability,Current Liabilities/Liability,Working Capital/Equity,Current Liabilities/Equity,Long-term Liability to Current Assets,Retained Earnings to Total Assets,Total income/Total expense,Total expense/Assets,Current Asset Turnover Rate,Quick Asset Turnover Rate,Working capitcal Turnover Rate,Cash Turnover Rate,Cash Flow to Sales,Fixed Assets to Assets,Current Liability to Liability,Current Liability to Equity,Equity to Long-term Liability,Cash Flow to Total Assets,Cash Flow to Liability,CFO to Assets,Cash Flow to Equity,Current Liability to Current Assets,Liability-Assets Flag,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,0.370594,0.424389,0.40575,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,0.780985,0.0001256969,0.0,0.458143,0.000725,0.0,0.14795,0.14795,0.14795,0.169141,0.311664,0.01756,0.095921,0.138736,0.022102,0.848195,0.688979,0.688979,0.217535,4980000000.0,0.000327,0.2631,0.363725,0.002259,0.001208,0.629951,0.021266,0.207576,0.792424,0.005024,...,0.190643,0.004094,0.001997,0.000147336,0.147308,0.334015,0.27692,0.001036,0.676269,0.721275,0.339077,0.025592,0.903225,0.002022,0.064856,701000000.0,6550000000.0,0.593831,458000000.0,0.671568,0.424206,0.676269,0.339077,0.126549,0.637555,0.458609,0.520382,0.312905,0.11825,0,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,1,0.464291,0.538214,0.51673,0.610235,0.610235,0.998946,0.79738,0.809301,0.303556,0.781506,0.0002897851,0.0,0.461867,0.000647,0.0,0.182251,0.182251,0.182251,0.208944,0.318137,0.021144,0.093722,0.169918,0.02208,0.848088,0.689693,0.689702,0.21762,6110000000.0,0.000443,0.264516,0.376709,0.006016,0.004039,0.635172,0.012502,0.171176,0.828824,0.005059,...,0.182419,0.014948,0.004136,0.00138391,0.056963,0.341106,0.289642,0.00521,0.308589,0.731975,0.32974,0.023947,0.931065,0.002226,0.025516,0.0001065198,7700000000.0,0.593916,2490000000.0,0.67157,0.468828,0.308589,0.32974,0.120916,0.6411,0.459001,0.567101,0.314163,0.047775,0,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,0.426071,0.499019,0.472295,0.60145,0.601364,0.998857,0.796403,0.808388,0.302035,0.780284,0.0002361297,25500000.0,0.458521,0.00079,0.0,0.177911,0.177911,0.193713,0.180581,0.307102,0.005944,0.092338,0.142803,0.02276,0.848094,0.689463,0.68947,0.217601,7280000000.0,0.000396,0.264184,0.368913,0.011543,0.005348,0.629631,0.021248,0.207516,0.792484,0.0051,...,0.602806,0.000991,0.006302,5340000000.0,0.098162,0.336731,0.277456,0.013879,0.446027,0.742729,0.334777,0.003715,0.909903,0.00206,0.021387,0.001791094,0.001022676,0.594502,761000000.0,0.671571,0.276179,0.446027,0.334777,0.117922,0.642765,0.459254,0.538491,0.314515,0.025346,0,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,1,0.399844,0.451265,0.457733,0.583541,0.583541,0.9987,0.796967,0.808966,0.30335,0.781241,0.0001078888,0.0,0.465705,0.000449,0.0,0.154187,0.154187,0.154187,0.193722,0.321674,0.014368,0.077762,0.148603,0.022046,0.848005,0.68911,0.68911,0.217568,4880000000.0,0.000382,0.263371,0.384077,0.004194,0.002896,0.630228,0.009572,0.151465,0.848535,0.005047,...,0.225815,0.018851,0.002961,0.001010646,0.098715,0.348716,0.27658,0.00354,0.615848,0.729825,0.331509,0.022165,0.906902,0.001831,0.024161,8140000000.0,6050000000.0,0.593889,2030000000.0,0.671519,0.559144,0.615848,0.331509,0.12076,0.579039,0.448518,0.604105,0.302382,0.06725,0,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,1,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,0.78155,7890000000.0,0.0,0.462746,0.000686,0.0,0.167502,0.167502,0.167502,0.212537,0.319162,0.02969,0.096898,0.168412,0.022096,0.848258,0.689697,0.689697,0.217626,5510000000.0,0.000439,0.265218,0.37969,0.006022,0.003727,0.636055,0.00515,0.106509,0.893491,0.005303,...,0.35838,0.014161,0.004275,0.0006804636,0.110195,0.344639,0.287913,0.004869,0.975007,0.732,0.330726,0.0,0.91385,0.002224,0.026385,6680000000.0,5050000000.0,0.593915,824000000.0,0.671563,0.309555,0.975007,0.330726,0.110933,0.622374,0.454411,0.578469,0.311567,0.047725,0,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.03549


In [44]:
X = brupt.drop('Bankrupt?', axis=1)
y = brupt['Bankrupt?']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24, stratify=y)
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.06549364613880743


## Sonar Dataset

In [45]:
sonar = pd.read_csv("./Cases/Sonar/Sonar.csv")
sonar.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,V29,V30,V31,V32,V33,V34,V35,V36,V37,V38,V39,V40,V41,V42,V43,V44,V45,V46,V47,V48,V49,V50,V51,V52,V53,V54,V55,V56,V57,V58,V59,V60,Class
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,0.1609,0.1582,0.2238,0.0645,0.066,0.2273,0.31,0.2999,0.5078,0.4797,0.5783,0.5071,0.4328,0.555,0.6711,0.6415,0.7104,0.808,0.6791,0.3857,0.1307,0.2604,0.5121,0.7547,0.8537,0.8507,0.6692,0.6097,0.4943,0.2744,0.051,0.2834,0.2825,0.4256,0.2641,0.1386,0.1051,0.1343,0.0383,0.0324,0.0232,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,0.4918,0.6552,0.6919,0.7797,0.7464,0.9444,1.0,0.8874,0.8024,0.7818,0.5212,0.4052,0.3957,0.3914,0.325,0.32,0.3271,0.2767,0.4423,0.2028,0.3788,0.2947,0.1984,0.2341,0.1306,0.4182,0.3835,0.1057,0.184,0.197,0.1674,0.0583,0.1401,0.1628,0.0621,0.0203,0.053,0.0742,0.0409,0.0061,0.0125,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,0.6333,0.706,0.5544,0.532,0.6479,0.6931,0.6759,0.7551,0.8929,0.8619,0.7974,0.6737,0.4293,0.3648,0.5331,0.2413,0.507,0.8533,0.6036,0.8514,0.8512,0.5045,0.1862,0.2709,0.4232,0.3043,0.6116,0.6756,0.5375,0.4719,0.4647,0.2587,0.2129,0.2222,0.2111,0.0176,0.1348,0.0744,0.013,0.0106,0.0033,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,0.0881,0.1992,0.0184,0.2261,0.1729,0.2131,0.0693,0.2281,0.406,0.3973,0.2741,0.369,0.5556,0.4846,0.314,0.5334,0.5256,0.252,0.209,0.3559,0.626,0.734,0.612,0.3497,0.3953,0.3012,0.5408,0.8814,0.9857,0.9167,0.6121,0.5006,0.321,0.3202,0.4295,0.3654,0.2655,0.1576,0.0681,0.0294,0.0241,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,0.4152,0.3952,0.4256,0.4135,0.4528,0.5326,0.7306,0.6193,0.2032,0.4636,0.4148,0.4292,0.573,0.5399,0.3161,0.2285,0.6995,1.0,0.7262,0.4724,0.5103,0.5459,0.2881,0.0981,0.1951,0.4181,0.4604,0.3217,0.2828,0.243,0.1979,0.2444,0.1847,0.0841,0.0692,0.0528,0.0357,0.0085,0.023,0.0046,0.0156,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R


In [46]:
X = sonar.drop('Class', axis=1)
y = sonar['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24, stratify=y)
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.7142857142857143


## Rice dataset (UCI)

In [47]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
rice_cammeo_and_osmancik = fetch_ucirepo(id=545) 
  
# data (as pandas dataframes) 
X = rice_cammeo_and_osmancik.data.features 
y = rice_cammeo_and_osmancik.data.targets 
  
# metadata 
print(rice_cammeo_and_osmancik.metadata) 
  
# variable information 
print(rice_cammeo_and_osmancik.variables) 


{'uci_id': 545, 'name': 'Rice (Cammeo and Osmancik)', 'repository_url': 'https://archive.ics.uci.edu/dataset/545/rice+cammeo+and+osmancik', 'data_url': 'https://archive.ics.uci.edu/static/public/545/data.csv', 'abstract': "A total of 3810 rice grain's images were taken for the two species, processed and feature inferences were made. 7 morphological features were obtained for each grain of rice.", 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 3810, 'num_features': 7, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2019, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5MW4Z', 'creators': [], 'intro_paper': {'ID': 276, 'type': 'NATIVE', 'title': 'Classification of Rice Varieties Using Artificial Intelligence Methods', 'authors': 'Ilkay Cinar, M. Koklu', 'venue': 'International Journal of Intel

### GaussianNB 

In [57]:
X.head()

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent
0,15231,525.578979,229.749878,85.093788,0.928882,15617,0.572896
1,14656,494.311005,206.020065,91.730972,0.895405,15072,0.615436
2,14634,501.122009,214.106781,87.768288,0.912118,14954,0.693259
3,13176,458.342987,193.337387,87.448395,0.891861,13368,0.640669
4,14688,507.166992,211.743378,89.312454,0.906691,15262,0.646024


In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24, stratify=y)
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.9037620297462817


  y = column_or_1d(y, warn=True)


### KNN MinMaxScale

In [52]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline


In [61]:
n_values = np.arange(1,11)
accuracy = []
for i in n_values:
    scaler = MinMaxScaler()
    knn = KNeighborsClassifier(n_neighbors=i)
    pipe = Pipeline([('SCL', scaler), ('KNN', knn)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    # accuracy.append({'NN':i, 'r2_score': r2_score(y_test, y_pred)})
    accuracy.append(accuracy_score(y_test, y_pred))

i_max = np.argmax(accuracy)
print("Best n_value = ", n_values[i_max])
print("Best r2 score = ", accuracy[i_max])

  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


Best n_value =  8
Best r2 score =  0.9212598425196851


  return self._fit(X, y)


### KNN StdScaler

In [63]:
n_values = np.arange(1,11)
accuracy = []
for i in n_values:
    scaler = StandardScaler()
    knn = KNeighborsClassifier(n_neighbors=i)
    pipe = Pipeline([('SCL', scaler), ('KNN', knn)])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    # accuracy.append({'NN':i, 'r2_score': r2_score(y_test, y_pred)})
    accuracy.append(accuracy_score(y_test, y_pred))

i_max = np.argmax(accuracy)
print("Best n_value = ", n_values[i_max])
print("Best r2 score = ", accuracy[i_max])

  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)
  return self._fit(X, y)


Best n_value =  6
Best r2 score =  0.9247594050743657


  return self._fit(X, y)
  return self._fit(X, y)


## Wine Quality Dataset

In [67]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
wine_quality = fetch_ucirepo(id=186) 
  
# data (as pandas dataframes) 
X = wine_quality.data.features 
y = wine_quality.data.targets 
  
# metadata 
print(wine_quality.metadata) 
  
# variable information 
print(wine_quality.variables) 


{'uci_id': 186, 'name': 'Wine Quality', 'repository_url': 'https://archive.ics.uci.edu/dataset/186/wine+quality', 'data_url': 'https://archive.ics.uci.edu/static/public/186/data.csv', 'abstract': 'Two datasets are included, related to red and white vinho verde wine samples, from the north of Portugal. The goal is to model wine quality based on physicochemical tests (see [Cortez et al., 2009], http://www3.dsi.uminho.pt/pcortez/wine/).', 'area': 'Business', 'tasks': ['Classification', 'Regression'], 'characteristics': ['Multivariate'], 'num_instances': 4898, 'num_features': 11, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['quality'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2009, 'last_updated': 'Wed Nov 15 2023', 'dataset_doi': '10.24432/C56S3T', 'creators': ['Paulo Cortez', 'A. Cerdeira', 'F. Almeida', 'T. Matos', 'J. Reis'], 'intro_paper': {'ID': 252, 'type': 'NATIVE', 'title': 'Modeling wine preferences

In [70]:
X.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24, stratify=y)