In [11]:
import pandas as pd
import numpy as np

x = pd.read_csv("data/dengue_features_train.csv")
y = pd.read_csv("data/dengue_labels_train.csv")
data = x.merge(y, on=("city", "year", "weekofyear")).sort_values(by=["year", "weekofyear"])
data_sj = data[data.city=="sj"]
data_sj = data_sj.drop(labels=["week_start_date","city"], axis=1)
data_iq = data[data.city=="iq"]
data_iq = data_iq.drop(labels=["week_start_date","city"], axis=1)

In [12]:
def f(x):
    for i in np.arange(0,90,10):
        if x < i:
            return "%d-%d"%(i-10,i)
    return ">80"

data_sj.total_cases = data_sj.total_cases.apply(f)
data_sj.groupby("total_cases").weekofyear.count().sort_index()

total_cases
0-10     242
10-20    235
20-30    141
30-40    100
40-50     49
50-60     37
60-70     31
70-80     27
>80       74
Name: weekofyear, dtype: int64

In [13]:
from copy import deepcopy
from sklearn.preprocessing import StandardScaler

def input_missing_values(x_array):
    _x_array = deepcopy(x_array)
    size = len(_x_array)
    for i, x in enumerate(_x_array):
        for j, f_value in enumerate(x):
            if np.isnan(f_value):
                i_before = i
                i_after = i
                while np.isnan(_x_array[i_before, j]):
                    i_before -= 1
                    if i_before == -1:
                        break
                while np.isnan(_x_array[i_after, j]):
                    i_after += 1
                    if i_after == size:
                        break
                if i_before == -1 and i_after != size:
                    _x_array[i,j] = _x_array[i_after,j]
                elif i_before != -1 and i_after == size:
                    _x_array[i,j] = _x_array[i_before,j]
                elif i_before != -1:
                    _x_array[i,j] = (_x_array[i_before,j] + _x_array[i_after,j])/2
                else:
                    print("Nao ha valores para inputar na coluna de numero %d"%j)
                    return None
    return _x_array

def preprocess_data(data, columns="all", train_size=0.8):
    if columns != "all":
        data = data[columns+["total_cases"]]
    x = data.values[:,:-1]
    x = input_missing_values(x)
    y = data.values[:,-1]
    scaler = StandardScaler()
    x_std = scaler.fit_transform(x)
    train_size = int(train_size * len(x_std))
    x_train = x_std[:train_size,:]
    x_test = x_std[train_size:,:]
    y_train = y[:train_size]
    y_test = y[train_size:]
    return x_train, x_test, y_train, y_test

In [14]:
cols = [
        "year",
        "reanalysis_specific_humidity_g_per_kg",
        "reanalysis_dew_point_temp_k",
        "station_max_temp_c",
        "reanalysis_tdtr_k",
        "weekofyear"
       ]

x_train_sj, x_test_sj, y_train_sj, y_test_sj = preprocess_data(data_sj, columns="all", train_size=0.8)



In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_absolute_error

rf = RandomForestClassifier(random_state=1, max_depth=7)
rf.fit(x_train_sj, y_train_sj)
y_pred = rf.predict(x_train_sj)
print("Treino: %f"%accuracy_score(y_train_sj, y_pred))
y_pred = rf.predict(x_test_sj)
print("Teste: %f"%accuracy_score(y_test_sj, y_pred))

Treino: 0.750000
Teste: 0.356383


In [40]:
def f(x):
    if ">" in x:
        return int(x[1:])
    else:
        if x[0] != "0":
            return (int(x[:2]) + int(x[3:]))/2
        else:
            return (int(x[:1]) + int(x[2:]))/2
mean_absolute_error(list(map(f,y_test_sj)), list(map(f,y_pred)))

15.452127659574469

In [16]:
from sklearn.neighbors import KNeighborsClassifier

kn = KNeighborsClassifier()
kn.fit(x_train_sj, y_train_sj)
y_pred = kn.predict(x_train_sj)
print("Treino: %f"%accuracy_score(y_train_sj, y_pred))
y_pred = kn.predict(x_test_sj)
print("Teste: %f"%accuracy_score(y_test_sj, y_pred))

Treino: 0.509358
Teste: 0.303191


In [17]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(x_train_sj, y_train_sj)
y_pred = svc.predict(x_train_sj)
print("Treino: %f"%accuracy_score(y_train_sj, y_pred))
y_pred = svc.predict(x_test_sj)
print("Teste: %f"%accuracy_score(y_test_sj, y_pred))

Treino: 0.483957
Teste: 0.308511


In [18]:
from sklearn.gaussian_process import GaussianProcessClassifier

gc = GaussianProcessClassifier()
gc.fit(x_train_sj, y_train_sj)
y_pred = gc.predict(x_train_sj)
print("Treino: %f"%accuracy_score(y_train_sj, y_pred))
y_pred = gc.predict(x_test_sj)
print("Teste: %f"%accuracy_score(y_test_sj, y_pred))

KeyboardInterrupt: 