# Including libraries

In [1]:
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import KNeighborsRegressor
import graphlab as gl
import numpy as np

## Reading and Parsing the Data for the DataSet cinco.csv

In [2]:
table5 = gl.SFrame.read_csv(url="cinco.csv")

x = table5.to_numpy()
y = x[:, 4]
x = x[:, 0:4]

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1488820297.log
INFO:graphlab.cython.cy_server:GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1488820297.log


This non-commercial license of GraphLab Create for academic use is assigned to clebsondm@gmail.com and will expire on December 05, 2017.
------------------------------------------------------


Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


### Transforming the data to numeric values

In [3]:
def convert(x):
    for i in xrange(x.shape[0]):
        for j in xrange(x.shape[1]):
            if x[i][j] in ['Ensolarado', 'Quente', 'Alta', 'Falso']:
                x[i][j] = 0
            elif x[i][j] in['Chuvoso', 'Frio', 'Normal', 'Verdadeiro']:
                x[i][j] = 1
            elif x[i][j] in ['Nublado', 'Morna']:
                x[i][j] = 2
    return x

def modify_y(y):
    for i in xrange(len(y)):
        if y[i] == 'Nao':
            y[i] = 0
        else:
            y[i] = 1
    return y

x = convert(x)
y = modify_y(y)
x = x.astype(int)
y = y.astype(int)

### Let's use the Naive Bayes to make the prediction for the sample: [Ensolarado", "Quente", "Normal", "Verdadeiro"]

In [4]:
gnb = GaussianNB()
gnb.fit(X = x, y=y)
sample_test1 = np.array([["Ensolarado", "Quente", "Normal", "Verdadeiro"]])
sample_test1 = convert(sample_test1).astype(int)
print "Predicao:", ["Sim" if x == 1 else "Nao" for x in gnb.predict(X = sample_test1)]

Predicao: ['Nao']


# Reading and Parsing the Data for the DataSet seis.csv

In [5]:
table6 = gl.SFrame.read_csv(url = "seis.csv", delimiter=";")

x6 = table6.to_numpy()
y6 = x6[:, 4]
x6 = x6[:, 0:4]

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [6]:
def convert2(data):
    for i in xrange(data.shape[0]):
        for j in xrange(data.shape[1]):
            if data[i][j] in ["Nao", "Nenhum", "Tailandes"]:
                data[i][j] = 0
            elif data[i][j] in ["Sim", "Alguns", "Frances"]:
                data[i][j] = 1
            elif data[i][j] in ["Cheio", "Hamburguer"]:
                data[i][j] = 2
            elif data[i][j] in ["Italiano"]:
                data[i][j] = 3
    return data

x6 = convert2(x6)
x6 = x6.astype(int)

y6 = np.array(y6)
y6 = modify_y(y6).astype(int)

### Building a Decision Tree and a K-Nearest Neighbor classifier

In [7]:
dt = DecisionTreeClassifier("entropy")
dt.fit(X = x6, y=y6)

knc = KNeighborsClassifier(n_neighbors=3, weights="uniform", algorithm='brute')
knc.fit(X = x6, y = y6)

test_sample2 = np.array([[1, 1, 2, 1]])
test_sample3 = np.array([[1, 1, 1, 3]])

### Prediction for test samples 2 and 3

In [8]:
print "Predicao:", ["Sim" if x == 1 else "Nao" for x in dt.predict(X = test_sample2)]
print "Predicao:", ["Sim" if x == 1 else "Nao" for x in knc.predict(X = test_sample3)]

Predicao: ['Sim']
Predicao: ['Sim']


# Now Let's use regression

### Reading Data

In [9]:
table_regression = gl.SFrame.read_csv(url = "regressao.csv", delimiter=",")

### Setting up a Leave one out cross validation using graphlab create.

In [10]:
folds = gl.cross_validation.KFold(table_regression, table_regression.shape[0])
rmse = list()
for train, valid in folds:
    model = gl.regression.create(dataset=table_regression, target='Y',
                                 features=["X1", "X2", "X3", "X4", "X5"])
    evaluation = model.evaluate(valid)
    rmse.append(evaluation['rmse'])

In [11]:
print "RMSE:", np.mean(rmse)

RMSE: 6.44226150513


### Setting up the data to use DecisionTreeRegressor and K-Nearest Neighbor Regressor using sklearn.

In [12]:
x7 = table_regression.to_numpy()
y7 = x7[:, 5]
x7 = x7[:, 0:5]

### Decision Tree Regressor

In [13]:
dtr = DecisionTreeRegressor(criterion="mse", max_depth=3)
dtr.fit(X=x7, y=y7)
print "Predicao DT Regressor: ", dtr.predict(X=np.array([[245, 4, 9700, 4600, 1835]]))

Predicao DT Regressor:  [ 205.]


### K-Nearest Neighbor Regressor

In [14]:
knr = KNeighborsRegressor(n_neighbors=5, metric="euclidean")

knr.fit(X=x7, y=y7)
print "KNN Regressor: ", knr.predict(X=np.array([[245, 4, 9700, 4600, 1835]]))

KNN Regressor:  [ 128.]
