## Data Mining and Machine Learning
### Multiclass Logistic Regression
#### Libraries: H2O and scikit-learn
#### Edgar Acuna

In [1]:
import h2o
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.naive_bayes import H2ONaiveBayesEstimator
from h2o.estimators.glm   import H2OGeneralizedLinearEstimator

# Attach this session to an H2O server (the recorded output below shows
# a server at http://localhost:54321 answering the connection).
h2o.connect()
# Turn off H2O's progress bars so later cell outputs show only results.
h2o.no_progress()


  _nan_object_mask = _nan_object_array != _nan_object_array


Connecting to H2O server at http://localhost:54321... successful.


0,1
H2O cluster uptime:,9 days 4 hours 32 mins
H2O cluster timezone:,America/La_Paz
H2O data parsing timezone:,UTC
H2O cluster version:,3.20.0.8
H2O cluster version age:,1 month and 10 days
H2O cluster name:,H2O_started_from_R_edgar2017_gsn168
H2O cluster total nodes:,1
H2O cluster free memory:,1.578 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


### I. Applying Logistic Regression to the Vehicle dataset using H2O

In [2]:
# Load the Vehicle data into H2O and turn the label column into a factor
# (categorical) column so it can serve as the class response.
vehicle = h2o.import_file("https://academic.uprm.edu/eacuna/vehicle.csv")
vehicle['class'] = vehicle['class'].asfactor()

# Predictor columns are V1 .. V18; the response is the 'class' column.
predictors = ['V%d' % idx for idx in range(1, 19)]
response_col = "class"

# No train/test split is made here: the model is fit on the full frame and
# the confusion matrix below is computed on that same frame.
# lambda_=0 sets the regularization strength to zero.
glm_model = H2OGeneralizedLinearEstimator(family="multinomial", lambda_=0)
glm_model.train(x=predictors, y=response_col, training_frame=vehicle)

# Last expression of the cell: displayed as the cell output.
glm_model.confusion_matrix(vehicle)



Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5
1.0,2.0,3.0,4.0,Error,Rate
214.0,1.0,2.0,1.0,0.0183486,4 / 218
1.0,142.0,67.0,2.0,0.3301887,70 / 212
4.0,57.0,154.0,2.0,0.2903226,63 / 217
1.0,2.0,0.0,196.0,0.0150754,3 / 199
220.0,202.0,223.0,201.0,0.1654846,140 / 846




In [3]:
# Compute the prediction rate: the fraction of rows of `vehicle` whose
# predicted class equals the actual class (evaluated on the training frame).
y_pred = glm_model.predict(vehicle)
# The whole expression is wrapped in print(...). The former
# `print (a == b).sum()/n` form only worked as a Python 2 print statement;
# under Python 3 it calls .sum() on print's None return value and raises
# AttributeError. This form prints the same value under both versions.
print((y_pred['predict'] == vehicle['class']).sum() / float(len(vehicle)))

0.83451536643


### II. Logistic Regression for Vehicle using scikit-learn

In [4]:
# Read the Vehicle data into a pandas DataFrame.
df1 = pd.read_csv("http://academic.uprm.edu/eacuna/vehicle.csv")

# Separate the class column (response) from the first 18 columns (predictors).
y = df1['class']
X = df1.iloc[:, :18]

# Fit the logistic regression; fit() returns the estimator itself, so the
# fitted model can be bound in a single statement.
model = LogisticRegression(solver="lbfgs", max_iter=1000).fit(X, y)

# Mean accuracy on the same data the model was fit on (cell output).
model.score(X, y)



0.81560283687943258

In [5]:
# Predict on the same rows the model was fit on, then print per-class
# precision, recall, F1 and support.
predictions = model.predict(X)
report_text = classification_report(y_true=y, y_pred=predictions)
print(report_text)

              precision    recall  f1-score   support

           1       0.93      0.97      0.95       218
           2       0.70      0.61      0.65       212
           3       0.67      0.71      0.69       217
           4       0.96      0.97      0.97       199

   micro avg       0.82      0.82      0.82       846
   macro avg       0.81      0.82      0.82       846
weighted avg       0.81      0.82      0.81       846



### III. Logistic Regression for Landsat using scikit-learn

In [6]:
url = 'http://academic.uprm.edu/eacuna/landsat.txt'
# The file is whitespace-delimited with no header row. sep=r"\s+" is the
# documented equivalent of delim_whitespace=True, which is deprecated in
# recent pandas releases.
data = pd.read_table(url, header=None, sep=r"\s+")
# Column 36 (the last one) holds the class; columns 0..35 are the predictors.
y = data.iloc[:, 36]
X = data.iloc[:, 0:36]
# Fit the logistic regression and report its mean accuracy on the same data
# it was fit on (cell output).
model = LogisticRegression(solver="lbfgs", max_iter=1000)
model = model.fit(X, y)
model.score(X, y)

0.85028184892897407

In [7]:
# Predict on the same rows the model was fit on, then print per-class
# precision, recall, F1 and support.
predictions = model.predict(X)
report_text = classification_report(y_true=y, y_pred=predictions)
print(report_text)

              precision    recall  f1-score   support

           1       0.95      0.98      0.97      1072
           2       0.92      0.97      0.95       479
           3       0.83      0.96      0.89       961
           4       0.51      0.13      0.21       415
           5       0.92      0.69      0.79       470
           6       0.76      0.92      0.83      1038

   micro avg       0.85      0.85      0.85      4435
   macro avg       0.82      0.78      0.77      4435
weighted avg       0.83      0.85      0.83      4435



### IV. Logistic Regression for Landsat using H2O

In [8]:
# Load the Landsat data into H2O; the columns are referenced as C1 .. C37.
landsat = h2o.import_file("https://academic.uprm.edu/eacuna/landsat.txt")

# C1 .. C36 are the predictors; C37 is the response and is converted to a
# factor (categorical) column so it can serve as the class label.
predictors = ['C%d' % idx for idx in range(1, 37)]
response_col = "C37"
landsat[response_col] = landsat[response_col].asfactor()


In [9]:
# Multinomial GLM on the Landsat frame, fit on the full frame.
# lambda_=0 sets the regularization strength to zero.
glm_model = H2OGeneralizedLinearEstimator(family="multinomial", lambda_=0)
glm_model.train(x=predictors, y=response_col, training_frame=landsat)

# Confusion matrix on the same frame used for fitting (cell output).
glm_model.confusion_matrix(landsat)

Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4,5,6,7
1.0,2.0,3.0,4.0,5.0,6.0,Error,Rate
1049.0,0.0,14.0,2.0,7.0,0.0,0.0214552,"23 / 1,072"
0.0,462.0,0.0,0.0,17.0,0.0,0.0354906,17 / 479
5.0,0.0,905.0,49.0,0.0,2.0,0.0582726,56 / 961
6.0,2.0,90.0,172.0,8.0,137.0,0.5855422,243 / 415
15.0,11.0,0.0,4.0,392.0,48.0,0.1659574,78 / 470
0.0,0.0,13.0,69.0,30.0,926.0,0.1078998,"112 / 1,038"
1075.0,475.0,1022.0,296.0,454.0,1113.0,0.1192785,"529 / 4,435"




In [10]:
# Compute the prediction rate: the fraction of rows of `landsat` whose
# predicted class equals the actual class (evaluated on the training frame).
y_pred = glm_model.predict(landsat)
# The whole expression is wrapped in print(...). The former
# `print (a == b).sum()/n` form only worked as a Python 2 print statement;
# under Python 3 it calls .sum() on print's None return value and raises
# AttributeError. This form prints the same value under both versions.
print((y_pred['predict'] == landsat['C37']).sum() / float(len(landsat)))

0.880721533258
