In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import plotly.express as px

In [None]:
def plotear_scatter(X, cat_color, size = 3):
    """Show an interactive scatter plot of the t-SNE embedding.

    Points are placed at (``tsne_x``, ``tsne_y``), coloured by ``cat_color``
    and sized proportionally to ``score`` (missing scores count as 0).

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``tsne_x``, ``tsne_y``, ``score``, the column named by
        ``cat_color`` and every column listed in ``hover_data`` below.
    cat_color : str
        Name of the column used to colour the points.
    size : int, optional
        Maximum marker size, forwarded to plotly as ``size_max``.

    Notes
    -----
    The helper columns are computed on a copy, so the caller's frame is not
    modified (previously ``score`` and ``dummy_column_for_size`` were
    overwritten in place on every call).
    """
    score = X['score'].fillna(0)
    # assign() returns a new frame; plotly needs the size as a real column.
    plot_df = X.assign(score=score, dummy_column_for_size=score * 20)
    fig = px.scatter(plot_df,
                     x="tsne_x",
                     y="tsne_y",
                     color=cat_color,
                     hover_data=['Alquiler_rel', 'Alquiler temporal_rel', 'leads_count', 'precision_busqueda', 'precision_barriofav', 'hits_count', 'ctr'],
                     size='dummy_column_for_size',
                     size_max=size)
    fig.show()

In [None]:
# Run switches and file locations used by the cells below.
# train_tsne: True fits t-SNE on the training rows; False loads the frame
#             previously pickled at `data_with_embeddings`.
# train_tree: True fits a new decision tree and pickles it to `dt_file_name`;
#             False loads the tree from that file.
# save_output: True pickles the final frame at the end of the notebook.
train_tsne = False
train_tree = False
save_output = False
# Portal code; used here to build the name of the embeddings cache file.
portal = 'ZPAR'
dt_file_name = 'decision_tree.pkl'
# Input CSV with the raw scores (read with its first column as the index).
data_file = '../../Data/recscores/recscores_ZPAR_2022-03-20.csv'
data_with_embeddings = 'data_{}.pkl'.format(portal)
# max_depth of the decision tree and n_clusters of K-Means, respectively.
decision_tree_depth = 4
number_of_clusters = 8

In [None]:
# Load the raw scores; the first CSV column becomes the index.
df = pd.read_csv(data_file, index_col=0)
print("Cargado {}".format(data_file))

# Fill missing values: 'Desconocido' for object (string) columns, 0 for the rest.
fill_values = {
    name: 'Desconocido' if df[name].dtype == 'O' else 0
    for name in df.columns
}
for name, value in fill_values.items():
    df[name] = df[name].fillna(value)

In [None]:
# Convert leads_count to integers. Non-numeric placeholders such as '-' are
# coerced to NaN and counted as 0 leads instead of raising in the cast.
df['leads_count'] = pd.to_numeric(df['leads_count'], errors='coerce').fillna(0).astype(int)

In [None]:
# Constant placeholder for the marker-size column; plotear_scatter derives the
# actual sizes from `score` when it plots.
df['dummy_column_for_size'] = 1

In [None]:
# Parse hits_count as integers: go through str so commas (presumably thousands
# separators - confirm against the CSV) can be stripped before the cast.
df['hits_count'] = (
    df['hits_count']
    .astype(str)
    .str.replace(',', '')
    .astype('int')
)

In [None]:
# Click-through rate: leads per hit, capped at 1, then square-rooted.
ctr_raw = df['leads_count'] / df['hits_count']
# x/0 yields inf (capped to 1 by clip) but 0/0 yields NaN, which would flow
# into `features` and be rejected by t-SNE / K-Means. Rows with no hits and
# no leads therefore get ctr = 0.
df['ctr'] = np.sqrt(ctr_raw.clip(upper=1).fillna(0))

In [None]:
# Columns used as model inputs (t-SNE, K-Means and the decision tree).
features = [
    'Alquiler_rel',
    'Alquiler temporal_rel',
    'ctr',
    'precision_operacionfav',
    'precision_precio',
    'precision_barriofav',
]

In [None]:
# Random ~80/20 train/test split. The generator is seeded so the split is the
# same on every run; an unseeded split changes each time, while the cached
# embeddings and decision tree loaded later were built from one fixed split.
rng = np.random.default_rng(0)
df['Train'] = rng.random(len(df)) < 0.8

# Take copies: columns are added to both frames further down, and assigning
# into a slice of df triggers SettingWithCopyWarning.
test = df[~df['Train']].copy()
train = df[df['Train']].copy()

In [None]:
# Feature matrix of the training rows; this is what t-SNE is fitted on when
# train_tsne is enabled.
X = train[features]

In [None]:
# Either reuse the cached frame (training rows + t-SNE coordinates) or fit
# t-SNE now and append the 2-D coordinates to the training rows.
if not train_tsne:
    print("Usando datos guardados en {}".format(data_with_embeddings))
    X = pd.read_pickle(data_with_embeddings)
else:
    print("Entrenando TSNE para {} usuarios".format(len(X)))
    X_embedded = TSNE(n_components=2, init='random', perplexity=500, random_state=3).fit_transform(X)
    embedding = pd.DataFrame(X_embedded).rename(columns={0: 'tsne_x', 1: 'tsne_y'})
    # Reset the index so rows line up positionally with the embedding.
    X = pd.concat([train.reset_index(drop=True), embedding], axis=1)

In [None]:
# One scatter per model feature, plus one coloured by "operacion_favorita".
color_columns = [*features, "operacion_favorita"]
for color_column in color_columns:
    plotear_scatter(X, color_column, size=4)

### K Means

In [None]:
import pandas as pd

In [None]:
# Same list as the `features` defined before the train/test split; keep the two
# in sync, since K-Means and the decision tree below use this one.
features = ['Alquiler_rel', 'Alquiler temporal_rel', 'ctr', 'precision_operacionfav', 'precision_precio', 'precision_barriofav']

In [None]:
from sklearn.cluster import KMeans

# Cluster the rows of X on the feature columns into `number_of_clusters` groups.
# random_state is fixed and n_init=50 is passed to KMeans; %time reports how
# long the fit takes.
%time kmeans = KMeans(n_clusters=number_of_clusters, random_state=0, n_init = 50).fit(X[features])

In [None]:
# Cluster id assigned by K-Means to each row of X (same row order as the fit input).
X['label_Km'] = kmeans.labels_

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
#import plotly.express as px
import pickle

In [None]:
def save_model(model, file_name):
    """Pickle ``model`` into ``file_name``, replacing any existing file."""
    with open(file_name, 'wb') as out_file:
        out_file.write(pickle.dumps(model))
        
def load_model(file_name):
    """Return the object unpickled from ``file_name``.

    Unpickling can execute arbitrary code, so only call this on files from a
    trusted source.
    """
    with open(file_name, 'rb') as in_file:
        return pickle.load(in_file)

In [None]:
# Obtain the decision tree that approximates the K-Means labels: load the
# pickled one, or fit a fresh tree on the feature columns and pickle it.
if not train_tree:
    dtree = load_model(dt_file_name)
    print('Arbol de decision cargado de {}'.format(dt_file_name))
else:
    dtree = DecisionTreeClassifier(
        max_depth=decision_tree_depth,
        min_samples_leaf=100,
    ).fit(X[features], X['label_Km'])
    save_model(dtree, dt_file_name)
    print('Arbol de decision guardado en {}'.format(dt_file_name))

# Tree predictions next to the K-Means labels; both are cast to str so plotly
# colours them as discrete categories.
X['pred_label_km'] = dtree.predict(X[features])
for label_col in ('pred_label_km', 'label_Km'):
    X[label_col] = X[label_col].astype(str)

for label_col in ("label_Km", "pred_label_km"):
    plotear_scatter(X, label_col, size=5)

In [None]:
import graphviz

# Export the fitted tree as DOT text. Class names come from the tree's own
# classes_ (ascending order, as export_graphviz expects), so they stay correct
# for any number of clusters instead of relying on a hard-coded range(10).
dot_data = tree.export_graphviz(dtree,
                                out_file=None,
                                feature_names=features,
                                class_names=[str(label) for label in dtree.classes_],
                                filled=True)

# Keep the DOT source on disk and show the rendered graph as the cell output.
graph = graphviz.Source(dot_data, format="png")
graph.save('decision_tree.dot')
graph

In [None]:
import pydot

# graph_from_dot_file returns a list of graphs; the tuple unpacking enforces
# that the file holds exactly one. A real name is used instead of `_`, which
# IPython reserves for the last cell output.
(dot_graph,) = pydot.graph_from_dot_file('decision_tree.dot')
dot_graph.write_png('decision_tree.png')

In [None]:
# Per predicted cluster: mean of every feature and number of rows.
X.groupby('pred_label_km')[features].agg(['mean', 'count'])

In [None]:
# Tree-predicted cluster for every row of both splits, stored as strings.
for frame in (test, train):
    frame['pred_label_km'] = dtree.predict(frame[features])
    frame['pred_label_km'] = frame['pred_label_km'].astype(str)

In [None]:
# Same per-cluster summary as for X, computed on the held-out rows.
test.groupby('pred_label_km')[features].agg(['mean', 'count'])

In [None]:
# Reference labels: the K-Means cluster for each row of both splits.
for frame in (test, train):
    frame['label_Km'] = kmeans.predict(frame[features])

In [None]:
# A row is correct when the tree's prediction (stored as str, so cast back to
# int) equals the K-Means label.
for frame in (test, train):
    frame['Correct'] = frame['pred_label_km'].astype('int') == frame['label_Km']

In [None]:
# Per K-Means cluster on the training rows: share of rows where the tree agrees
# (mean of Correct) and cluster size.
train.groupby('label_Km')['Correct'].agg(['mean', 'count'])

In [None]:
# Same agreement table for the held-out rows.
test.groupby('label_Km')['Correct'].agg(['mean', 'count'])

In [None]:
if save_output:
    print("Grabando datos")
    # data_with_embeddings already ends in ".pkl" and is the exact path the
    # t-SNE cell reads back. Wrapping it in "./{}.pkl" wrote to "*.pkl.pkl",
    # so the cache was never refreshed.
    X.to_pickle(data_with_embeddings)
else:
    print("Los datos no se grabaron")

In [None]:
# Display the final frame (training rows, t-SNE coordinates and cluster labels).
X