## Data Mining and Machine Learning
### Multiclass Logistic Regression
### Datasets: Vehicle and Landsat
### Libraries: H2O and scikit-learn
#### Edgar Acuna
#### March 2021

In [8]:
import h2o
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.naive_bayes import H2ONaiveBayesEstimator
from h2o.estimators.glm   import H2OGeneralizedLinearEstimator
import warnings
# Address of the H2O instance this notebook talks to.
H2O_IP = "localhost"
H2O_PORT = 54323

# Silence every Python warning for the rest of the session. Note that this
# also hides any deprecation or convergence warnings from the libraries below.
warnings.filterwarnings(action='ignore')

# Connect to the H2O instance at the address above.
h2o.init(ip=H2O_IP, port=H2O_PORT)

Checking whether there is an H2O instance running at http://localhost:54323 . connected.


0,1
H2O_cluster_uptime:,19 mins 02 secs
H2O_cluster_timezone:,America/La_Paz
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.1
H2O_cluster_version_age:,10 days
H2O_cluster_name:,H2O_from_python_eacun_dndsro
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.946 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


### I. Applying Logistic Regression to the Vehicle dataset using H2O

In [9]:
# Load the Vehicle data set into an H2OFrame.
vehicle = h2o.import_file("https://academic.uprm.edu/eacuna/vehicle.csv")

# The 18 predictor columns; the remaining column, "Class", is the response.
predictors = [
    "COMPACTNESS", "CIRCULARITY", "DISTANCE_CIRCULARITY", "RADIUS_RATIO",
    "PR.AXIS_ASPECT_RATIO", "MAX.LENGTH_ASPECT_RATIO", "SCATTER_RATIO",
    "ELONGATEDNESS", "PR.AXIS_RECTANGULARITY", "MAX.LENGTH_RECTANGULARITY",
    "SCALED_VARIANCE_MAJOR", "SCALED_VARIANCE_MINOR",
    "SCALED_RADIUS_OF_GYRATION", "SKEWNESS_ABOUT_MAJOR",
    "SKEWNESS_ABOUT_MINOR", "KURTOSIS_ABOUT_MAJOR", "KURTOSIS_ABOUT_MINOR",
    "HOLLOWS_RATIO",
]
response_col = "Class"

# Convert the response to a categorical (factor) column, as required for a
# multinomial model.
vehicle[response_col] = vehicle[response_col].asfactor()

# No train/test split is made here: the model is fitted on the whole frame and
# evaluated on that same frame, so the error rates below are training
# (resubstitution) errors.
glm_model = H2OGeneralizedLinearEstimator(family="multinomial", lambda_=0)
glm_model.train(x=predictors, y=response_col, training_frame=vehicle)

# Only the last expression of a cell is displayed: the confusion matrix of the
# fitted model on the training frame.
glm_model.confusion_matrix(vehicle)

Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%

Confusion Matrix: Row labels: Actual class; Column labels: Predicted class


Unnamed: 0,bus,opel,saab,van,Error,Rate
0,214.0,1.0,2.0,1.0,0.018349,4 / 218
1,1.0,142.0,67.0,2.0,0.330189,70 / 212
2,4.0,57.0,154.0,2.0,0.290323,63 / 217
3,1.0,2.0,0.0,196.0,0.015075,3 / 199
4,220.0,202.0,223.0,201.0,0.165485,140 / 846




In [10]:
# Compute the accuracy rate: the fraction of rows whose predicted label
# matches the actual class.
y_pred = glm_model.predict(vehicle)
n_correct = (y_pred['predict'] == vehicle['Class']).sum()
print(n_correct / len(vehicle))

glm prediction progress: |████████████████████████████████████████████████| 100%
0.83451536643026


### II. Logistic Regression for Vehicle using scikit-learn

In [11]:
# Read the Vehicle data set into a pandas DataFrame.
df1 = pd.read_csv("http://academic.uprm.edu/eacuna/vehicle.csv")

# Split the table into the predictors (first 18 columns) and the class column.
X = df1.iloc[:, :18]
y = df1['Class']

# Fit the logistic regression and report its accuracy on the data it was
# fitted on.
model = LogisticRegression(solver="lbfgs", max_iter=1000).fit(X, y)
model.score(X, y)

0.8274231678486997

In [12]:
# Per-class precision, recall and F1 of the fitted model's predictions on X.
predictions = model.predict(X)
report = classification_report(y_true=y, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

         bus       0.96      0.97      0.97       218
        opel       0.71      0.66      0.68       212
        saab       0.69      0.71      0.70       217
         van       0.95      0.98      0.97       199

    accuracy                           0.83       846
   macro avg       0.83      0.83      0.83       846
weighted avg       0.83      0.83      0.83       846



### III. Logistic Regression for Landsat using scikit-learn

In [13]:
# Load the Landsat data set: a whitespace-separated text file with no header
# row.
url = 'http://academic.uprm.edu/eacuna/landsat.txt'
# sep=r'\s+' is the supported replacement for delim_whitespace=True, which is
# deprecated in recent pandas releases.
data = pd.read_csv(url, header=None, sep=r'\s+')

# Column 36 holds the class label; columns 0-35 are the predictors.
y = data.iloc[:, 36]
X = data.iloc[:, 0:36]

# Fit the logistic regression and report its accuracy on the data it was
# fitted on.
model = LogisticRegression(solver="lbfgs", max_iter=1000)
model = model.fit(X, y)
model.score(X, y)

0.826381059751973

In [14]:
# Per-class precision, recall and F1 of the fitted model's predictions on X.
predictions = model.predict(X)
report = classification_report(y_true=y, y_pred=predictions)
print(report)

              precision    recall  f1-score   support

           1       0.98      0.98      0.98      1072
           2       0.96      0.92      0.94       479
           3       0.72      0.90      0.80       961
           4       0.54      0.14      0.23       415
           5       0.80      0.80      0.80       470
           6       0.78      0.84      0.81      1038

    accuracy                           0.83      4435
   macro avg       0.80      0.76      0.76      4435
weighted avg       0.81      0.83      0.81      4435



### IV. Logistic Regression for Landsat using H2O

In [15]:
# Load the Landsat data set into an H2OFrame.
landsat = h2o.import_file("https://academic.uprm.edu/eacuna/landsat.txt")

# Predictors are the columns named C1 through C36; C37 is the response.
predictors = ['C{}'.format(i) for i in range(1, 37)]
response_col = "C37"

# Convert the response to a categorical (factor) column.
landsat[response_col] = landsat[response_col].asfactor()


Parse progress: |█████████████████████████████████████████████████████████| 100%


In [16]:
# Fit a multinomial GLM with lambda_=0 on the whole Landsat frame. No hold-out
# split is made, so the confusion matrix below is computed on the training
# data.
glm_model = H2OGeneralizedLinearEstimator(family="multinomial", lambda_=0)
glm_model.train(x=predictors, y=response_col, training_frame=landsat)

# Only the last expression of a cell is displayed: the confusion matrix of the
# fitted model on the training frame.
glm_model.confusion_matrix(landsat)

glm Model Build progress: |███████████████████████████████████████████████| 100%

Confusion Matrix: Row labels: Actual class; Column labels: Predicted class


Unnamed: 0,1,2,3,4,5,6,Error,Rate
0,1049.0,0.0,14.0,2.0,7.0,0.0,0.021455,"23 / 1,072"
1,0.0,462.0,0.0,0.0,17.0,0.0,0.035491,17 / 479
2,5.0,0.0,905.0,49.0,0.0,2.0,0.058273,56 / 961
3,6.0,2.0,90.0,172.0,8.0,137.0,0.585542,243 / 415
4,15.0,11.0,0.0,4.0,392.0,48.0,0.165957,78 / 470
5,0.0,0.0,13.0,69.0,30.0,926.0,0.1079,"112 / 1,038"
6,1075.0,475.0,1022.0,296.0,454.0,1113.0,0.119278,"529 / 4,435"




In [17]:
# Compute the accuracy rate: the fraction of rows whose predicted label
# matches the actual class in column C37.
y_pred = glm_model.predict(landsat)
n_correct = (y_pred['predict'] == landsat['C37']).sum()
print(n_correct / len(landsat))

glm prediction progress: |████████████████████████████████████████████████| 100%
0.8807215332581736
