In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from collections import Counter

from imblearn.over_sampling import SMOTE

from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from imblearn.over_sampling import SMOTE

In [2]:
DATA='ugrin2020-vehiculo-usado-multiclase/'
TRAIN=DATA+'train.csv'
TEST=DATA+'test.csv'

PREPROCESSED_DATA='preprocessed_data/'
RESULTS='results/'

In [3]:
VARIANCETHRESHOLD=0

In [4]:
NOMBRE=DATA+'nombre.csv'
CIUDAD=DATA+'ciudad.csv'
COMBUSTIBLE=DATA+'combustible.csv'
TIPO_MARCHAS=DATA+'tipo_marchas.csv'

def encode(train, test):

    le = LabelEncoder()
    
    #train.Precio_cat=train.Precio_cat.apply(lambda x: float(x))

    # Codifico combustibles
    le.classes_=['LPG','CNG','Petrol','Diesel']
    train.Combustible = le.transform(train.Combustible)
    test.Combustible = le.transform(test.Combustible)

    le.classes_=['Manual','Automatic']
    train.Tipo_marchas = le.transform(train.Tipo_marchas)
    test.Tipo_marchas = le.transform(test.Tipo_marchas)

    # Codifico manos
    le.classes_=['First','Second','Third','Fourth & Above']
    train.Mano = le.transform(train.Mano)
    test.Mano = le.transform(test.Mano)

    # Consumo a numérica
    train.Consumo=train.Consumo.apply(lambda x: float(x.split(' ')[0]))
    test.Consumo=test.Consumo.apply(lambda x: float(x.split(' ')[0]))
    
    # CC a numérica
    train.Motor_CC=train.Motor_CC.apply(lambda x: float(x.split(' ')[0]))
    test.Motor_CC=test.Motor_CC.apply(lambda x: float(x.split(' ')[0]))

    # Potencia a numérica
    train.Potencia=train.Potencia.apply(lambda x: float(x.split(' ')[0]))
    test.Potencia=test.Potencia.apply(lambda x: float(x.split(' ')[0]))

    return train, test

# Split train label
def split(train, test):

    train_array = np.array(train)
    test_array = np.array(test)

    return train_array, test_array

def shuffle_in_unison(a, b):
    rng_state = np.random.get_state()
    np.random.shuffle(a)
    np.random.set_state(rng_state)
    np.random.shuffle(b)
    
# Normalización
def scale(train, test):

    # Estandarizamos los datos
    selector = VarianceThreshold(VARIANCETHRESHOLD) # No podemos estandarizar datos con varianza nula
    std = StandardScaler()

    selector.fit(train)
    train=selector.transform(train)
    test=selector.transform(test)

    std.fit(train)
    train=std.transform(train)
    test=std.transform(test)
    
    return train, test

In [5]:
train = pd.read_csv(TRAIN) # Cargo datos de entrenamiento
test = pd.read_csv(TEST) # Cargo datos de test

# Eliminamos el campo id ya que no se debe usar para predecir
test_ids = test['id']
del test['id']
del train['id']

# Cambiamos el nombre a la columna Año para poder manejarla correctamente
train.rename(columns = {'Año':'Anio'}, inplace = True)
test.rename(columns = {'Año':'Anio'}, inplace = True)

In [6]:
del train['Descuento']
del test['Descuento']

In [7]:
del train['Nombre']
del test['Nombre']
del train['Ciudad']
del test['Ciudad']

In [8]:
train=train[train.Combustible!='Electric']

In [9]:
train.dropna(inplace=True)

In [10]:
label=train.Precio_cat
del train['Precio_cat']

In [11]:
train, test = encode(train, test)
train, test = scale(train, test)

train, test = split(train, test)
train, label = SMOTE(random_state=25).fit_resample(train, label)

shuffle_in_unison(train, label)

In [12]:
for a in train.columns:
    if a in {'TOT_MUERTOS','TOT_HERIDOS_GRAVES'}:
        continue
    d=train[a][abs(zscore(train[a]))<3]
    train[a][zscore(train[a])<-3]=d.min()
    train[a][zscore(train[a])>3]=d.max()

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [None]:
model=GaussianNB()

In [None]:
pca=PCA(0.8)
train=pca.fit_transform(train)
test=pca.transform(test)

In [None]:
train.shape

In [None]:
test.shape

In [None]:
scores=cross_val_score(model, train, label, cv=5)
print(scores)
print(np.mean(scores))

## Generar fichero de Kaggle

In [None]:
model.fit(train,label)
# Ahora predecimos
predict = model.predict(test)
predict = list(map(int,predict))
# Generamos 
df_result = pd.DataFrame({'id': test_ids, 'Precio_cat': predict})
df_result.to_csv(RESULTS+"try12.csv", index=False)