### Clasificación utilizando scikit-learn (con pandas)

In [0]:
import csv
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [0]:
# Upload the data file from the local machine when running on Google Colab
# (presumably Cities.csv, which a later cell reads from the working
# directory -- confirm).
# Outside Colab the google.colab package is not installed, so instead of
# crashing the whole notebook we fall back to an empty result and rely on the
# file already being present next to the notebook.
try:
    from google.colab import files
except ImportError:
    # NOTE(review): assumes files.upload() returns a dict-like object, so an
    # empty dict is used as the "nothing uploaded" value -- confirm.
    uploaded = {}
else:
    uploaded = files.upload()

In [0]:
# Read Cities.csv into a dataframe and add a 'category' column that buckets
# each city's temperature into cold / cool / warm / hot.
#
# The path is handed straight to read_csv so pandas opens and closes the file
# itself. The previous open('Cities.csv', 'rU') handle was never closed, and
# the 'U' mode flag is no longer accepted by Python 3.11+.
cities = pd.read_csv('Cities.csv')


def temperature_category(temperature):
    """Return the category label for a single temperature value.

    Thresholds: below 5 -> 'cold', below 9 -> 'cool', below 15 -> 'warm',
    anything else -> 'hot'. A missing value (NaN) fails every comparison and
    therefore also ends up as 'hot'.
    """
    if temperature < 5:
        return 'cold'
    if temperature < 9:
        return 'cool'
    if temperature < 15:
        return 'warm'
    return 'hot'


# Iterate over the column values directly rather than looking rows up with
# cities.loc[i]: this does not depend on the index being 0..n-1 and avoids
# building a full row Series for every city.
cats = [temperature_category(t) for t in cities['temperature']]
cities['category'] = cats

# Report how many cities fall into each category. A single pre-formatted
# argument keeps the printed text identical on Python 2 and Python 3.
for label in ('cold', 'cool', 'warm', 'hot'):
    print('{}: {}'.format(label, cats.count(label)))

In [0]:
# Build the training and test sets.
# The rows are NOT shuffled: the first 85% of the dataframe (in its current
# order) becomes the training set and the remaining rows the test set.
numitems = len(cities)
percenttrain = 0.85
numtrain = int(numitems * percenttrain)
numtest = numitems - numtrain
# Single pre-formatted argument: same output on Python 2 and Python 3
# (the old print statements are a SyntaxError on Python 3).
print('Training set {} items'.format(numtrain))
print('Test set {} items'.format(numtest))
# Positional slicing; the original row labels are kept on both parts.
citiesTrain = cities.iloc[:numtrain]
citiesTest = cities.iloc[numtrain:]

# Display the held-out rows.
citiesTest

### K-nearest-neighbors classification

In [0]:
# Predict the temperature category from other attributes (features).
features = ['longitude', 'latitude']
neighbors = 10  # try changing this to 5
classifier = KNeighborsClassifier(n_neighbors=neighbors)
classifier.fit(citiesTrain[features], citiesTrain['category'])
predictions = classifier.predict(citiesTest[features])

# Compute the accuracy: the fraction of test rows whose predicted category
# matches the actual one.
numtrain = len(citiesTrain)
numtest = len(citiesTest)
correct = 0
# Walk the test columns in row order alongside the predictions. This replaces
# the citiesTest.loc[numtrain + i] lookups, which only worked while the
# dataframe kept its default 0..n-1 index.
for city, predicted, actual in zip(citiesTest['city'], predictions, citiesTest['category']):
    print('{}  Predicted: {}  Actual: {}'.format(city, predicted, actual))
    if predicted == actual:
        correct += 1
# float() keeps this a true division on Python 2 as well.
print('Accuracy: {}'.format(float(correct) / numtest))
# You can try changing the value of neighbors and also using other features.

### Decision tree classification

In [0]:
# Predict the temperature category from other attributes (features).
features = ['longitude', 'latitude']
split = 10  # try 23
# Both keyword arguments are optional; random_state is fixed so that repeated
# runs build the same tree.
dt = DecisionTreeClassifier(random_state=0, min_samples_split=split)
dt.fit(citiesTrain[features], citiesTrain['category'])
predictions = dt.predict(citiesTest[features])

# Compute the accuracy: the fraction of test rows whose predicted category
# matches the actual one.
numtrain = len(citiesTrain)
numtest = len(citiesTest)
# Pair predictions with the actual labels positionally instead of looking
# rows up by label (citiesTest.loc[numtrain + i]), which only worked while the
# dataframe kept its default 0..n-1 index.
correct = sum(
    1
    for predicted, actual in zip(predictions, citiesTest['category'])
    if predicted == actual
)
# float() keeps this a true division on Python 2 as well.
print('Accuracy: {}'.format(float(correct) / numtest))
# You can try changing the value of split and also using other features.

### "Forest" of decision trees

In [0]:
# Predict the temperature category from other attributes (features).
features = ['longitude', 'latitude']
split = 10
trees = 10  # try 1
# random_state is fixed so that repeated runs build the same forest.
rf = RandomForestClassifier(random_state=0, min_samples_split=split, n_estimators=trees)
rf.fit(citiesTrain[features], citiesTrain['category'])
predictions = rf.predict(citiesTest[features])

# Compute the accuracy: the fraction of test rows whose predicted category
# matches the actual one.
numtrain = len(citiesTrain)
numtest = len(citiesTest)
# Pair predictions with the actual labels positionally instead of looking
# rows up by label (citiesTest.loc[numtrain + i]), which only worked while the
# dataframe kept its default 0..n-1 index.
correct = sum(
    1
    for predicted, actual in zip(predictions, citiesTest['category'])
    if predicted == actual
)
# float() keeps this a true division on Python 2 as well.
print('Accuracy: {}'.format(float(correct) / numtest))
# You can try changing the values of split and trees and also using other features.

### Naive Bayes classification

In [0]:
# Predict the temperature category from other attributes (features).
features = ['longitude', 'latitude']
nb = GaussianNB()
nb.fit(citiesTrain[features], citiesTrain['category'])
predictions = nb.predict(citiesTest[features])

# Compute the accuracy: the fraction of test rows whose predicted category
# matches the actual one.
numtrain = len(citiesTrain)
numtest = len(citiesTest)
# Pair predictions with the actual labels positionally instead of looking
# rows up by label (citiesTest.loc[numtrain + i]), which only worked while the
# dataframe kept its default 0..n-1 index.
correct = sum(
    1
    for predicted, actual in zip(predictions, citiesTest['category'])
    if predicted == actual
)
# float() keeps this a true division on Python 2 as well.
print('Accuracy: {}'.format(float(correct) / numtest))
# You can also try other features.