In [419]:
import pymysql
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [420]:
def create_dummies(dataset,columns):
    """Replace each listed column of ``dataset`` with its one-hot indicator columns.

    For every name in ``columns`` (in the order given) the source column is
    removed and its indicator columns, named ``<column>_<value>``, are appended
    on the right. The caller's DataFrame is never modified; when ``columns`` is
    empty the input object itself is returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the columns to encode.
    columns : iterable of str
        Names of the columns to replace by indicator columns.

    Returns
    -------
    pandas.DataFrame
        Frame with the encoded columns replaced by their dummies.
    """
    encoded = dataset
    for name in columns:
        # axis=1 throughout: columns (not rows) are dropped and appended.
        indicators = pd.get_dummies(encoded[name], prefix=name)
        remaining = encoded.drop(name, axis=1)
        encoded = pd.concat([remaining, indicators], axis=1)
    return encoded

In [421]:
def normalize_columns(dataset,columns):
    """Rescale the given columns of ``dataset`` with scikit-learn's ``MinMaxScaler``.

    The scaler is created with its default settings and fitted on
    ``dataset[columns]``. The scaled columns are moved to the right-hand side
    of the returned frame; all other columns keep their relative order. The
    caller's DataFrame is not modified.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the numeric columns to rescale.
    columns : list of str
        Names of the columns to rescale.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index as ``dataset`` and the listed columns
        replaced by their scaled values.
    """
    scaler = MinMaxScaler()
    scaled_values = scaler.fit_transform(dataset[columns])

    # Reuse the caller's index: pd.concat(axis=1) aligns on index labels, so a
    # fresh RangeIndex here would pair scaled values with the wrong rows (and
    # introduce NaN rows) whenever ``dataset`` was filtered without a
    # reset_index.
    X_normalized = pd.DataFrame(scaled_values, columns=columns, index=dataset.index)

    # axis=1: drop / append columns, not rows.
    remaining = dataset.drop(columns, axis=1)
    new_dataset = pd.concat([remaining, X_normalized], axis=1)
    return new_dataset

In [422]:
# Load the nbo_model table into the DataFrame `df1`.
# NOTE(review): the connection settings (root user, empty password) are
# hardcoded; consider reading them from environment variables instead.
query="SELECT Id_cliente,Macro_sector,Sector,Subsector,Actividad,Ventas,Activo_fijo,Potencial,Cheques,Etapa,Subetapa,Monto,Producto from nbo_model;"
# Column names for the frame, in the same order as the SELECT list
# (Id_cliente is exposed as 'id').
headers=['id','Macro_sector','Sector','Subsector','Actividad','Ventas','Activo_fijo','Potencial','Cheques','Etapa','Subetapa','Monto','Producto']

try:
    conn=pymysql.connect(host='localhost', user='root', passwd='', db='CustomerInfo')
    try:
        cur=conn.cursor()
        try:
            cur.execute(query)
            res = cur.fetchall()
        finally:
            # Release the cursor even when the query fails.
            cur.close()
    finally:
        # Release the connection even when the query fails.
        conn.close()

except pymysql.Error as e:
    msj= ("Error %d: %s" % (e.args[0], e.args[1]))
    print(msj)
else:
    filas=len(res)
    print('El numero de filas de este dataset es de:'+str(filas))

    # Build the frame from the fetched rows in one step instead of appending
    # value by value to per-column lists.
    df1=pd.DataFrame(list(res),columns=headers)
    print(df1.shape)

    print(df1.dtypes)

El numero de filas de este dataset es de:30069
(30069, 13)
id               object
Macro_sector     object
Sector           object
Subsector        object
Actividad        object
Ventas          float64
Activo_fijo     float64
Potencial       float64
Cheques          object
Etapa            object
Subetapa         object
Monto           float64
Producto         object
dtype: object


## Análisis de datos

In [423]:
#df2=create_dummies(df1,['Etapa'])

In [424]:
# df2 is a second name for the same DataFrame object as df1 (no copy is made).
# The trailing comment on the assignment is a disabled row filter on 'Subetapa'.
df2=df1#[df1['Subetapa']=='N']
print(df2.shape)
#df2.head()

(30069, 13)


In [425]:
# Feature / target selection. Other feature subsets (categorical-only and
# numeric-only) were tried during exploration.
feature_columns=['Macro_sector','Sector','Subsector','Actividad','Ventas','Activo_fijo','Potencial']
target_columns=['Monto']

# Original index labels of the rows with a missing feature / missing target,
# kept for inspection.
indexes_empties_x=df2.index[df2[feature_columns].isnull().any(axis=1)].tolist()
indexes_empties_y=df2.index[df2[target_columns].isnull().any(axis=1)].tolist()

# Keep only the rows that are complete in BOTH features and target, using a
# single mask computed on the original index. Dropping by saved labels after a
# reset_index would remove the wrong rows and leave X and Y misaligned.
complete_rows=df2[feature_columns+target_columns].notnull().all(axis=1)
X=df2.loc[complete_rows,feature_columns].reset_index(drop=True)
Y=df2.loc[complete_rows,target_columns].reset_index(drop=True)

X.shape

(29991, 7)

In [426]:
# Rescale the numeric features; the remaining columns of X are carried over.
numeric_columns=['Ventas','Activo_fijo','Potencial']
X_normalized=normalize_columns(X,numeric_columns)
X_normalized.shape

(29991, 7)

In [427]:
# One-hot encode the categorical features. Only the columns still present are
# encoded, so re-running this cell is a no-op instead of failing with a
# KeyError on columns that were already replaced by their dummies.
categorical_columns=['Macro_sector','Sector','Subsector','Actividad']
pending_columns=[name for name in categorical_columns if name in X_normalized.columns]
X_normalized=create_dummies(X_normalized,pending_columns)
X_normalized.shape

(29991, 1020)

In [428]:
#categoricas=['Macro_sector','Sector','Subsector','Actividad']
#for c in categoricas:
#    X[c], _ = pd.factorize(X[c])
#X.dtypes 

In [429]:
from sklearn.model_selection import train_test_split

In [430]:
# Hold out 20% of the rows for evaluation. The seed is fixed so the split, and
# every score computed on it, is reproducible across kernel restarts.
X_train,X_test,Y_train,Y_test=train_test_split(X_normalized,Y.values.ravel(),test_size=.2,random_state=42)

In [431]:
from math import sqrt
from sklearn.metrics import mean_squared_error 

## Regresión lineal

from sklearn.linear_model import LinearRegression

# Linear regression baseline: fit on the training split, then evaluate on the
# hold-out split (estimator score and root mean squared error).
lr=LinearRegression().fit(X_train,Y_train)

predictl=lr.predict(X_test)
scorel=lr.score(X_test,Y_test)
errorl=sqrt(mean_squared_error(Y_test,predictl))

## Forest

In [412]:
%%time

from sklearn.ensemble import RandomForestRegressor

# Random forest with 20 trees on 2 worker processes. The seed is fixed so the
# fitted trees, and the metrics below, are reproducible between runs.
forest=RandomForestRegressor(n_jobs=2, n_estimators=20, random_state=42)

forest.fit(X_train,Y_train)

# Estimator score and root mean squared error on the hold-out split.
scoref=forest.score(X_test,Y_test)

predictf=forest.predict(X_test)

errorf = sqrt(mean_squared_error(Y_test,predictf))

CPU times: user 47.3 s, sys: 144 ms, total: 47.4 s
Wall time: 24.8 s


## SVM

%%time

from sklearn.svm import SVR
from sklearn.svm import SVC

# Support vector regression with a radial (RBF) kernel.
C=1e3
svc_rbf=SVR(kernel="rbf",C=C,gamma=0.1)

# fit() hands back the fitted estimator, which is kept under the name `model`.
model=svc_rbf.fit(X_train,Y_train)

# Hold-out predictions, estimator score and root mean squared error.
predicts=model.predict(X_test)
scores=model.score(X_test,Y_test)
errors=sqrt(mean_squared_error(Y_test,predicts))

## KNN

In [413]:
%%time

from sklearn.neighbors import KNeighborsRegressor

# k-nearest-neighbours regression (k=20) using 2 worker processes.
knn=KNeighborsRegressor(n_neighbors=20,n_jobs=2)
knn.fit(X_train,Y_train)

# Hold-out predictions, estimator score and root mean squared error.
predictknn=knn.predict(X_test)
scorek=knn.score(X_test,Y_test)
errork=sqrt(mean_squared_error(Y_test,predictknn))

#predictknn2=knn.predict_proba(X_test)

CPU times: user 4min 13s, sys: 344 ms, total: 4min 13s
Wall time: 2min 10s


## Conclusión

In [414]:
# Print, for each evaluated model, its score, its RMSE and the first five
# (prediction, actual) pairs. The linear-regression and SVR rows are disabled;
# uncomment them to include those models in the report.
model_reports=(
    #('>> LR.... <<',scorel,errorl,predictl),
    ('>> Forest.... <<',scoref,errorf,predictf),
    #('>> SVR.... <<',scores,errors,predicts),
    ('>> KNN.... <<',scorek,errork,predictknn),
)

for report_title,report_score,report_error,report_predictions in model_reports:
    print(report_title)
    print('Score',report_score)
    print('Error',report_error)
    print(list(zip(report_predictions,Y_test))[:5])
  

>> Forest.... <<
Score -1289024488731.8706
Error 368579566384665.7
[(45923954.97079341, 12000000.0), (4160594.1088416777, 200000.0), (651935.9863378117, 525365.81), (9861313.911880737, 1126746.951), (4669542.682587242, 700000.0)]
>> KNN.... <<
Score -79084214060.17581
Error 91294701337954.27
[(44534550.509609744, 12000000.0), (4176574.1504999995, 200000.0), (320196.212, 525365.81), (14897086.473072901, 1126746.951), (2365544.2985, 700000.0)]


## Obtener mejores variables

In [101]:
%%time

from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression

# Recursive feature elimination with cross-validation around the linear model.
rfe=RFECV(lr)
# Pass the target as a 1-D array, the same form used for the train/test split,
# instead of a single-column DataFrame.
rfe.fit(X_normalized,Y.values.ravel())

CPU times: user 1h 1min 4s, sys: 3min 49s, total: 1h 4min 54s
Wall time: 32min 38s


In [102]:
# Names of the columns that the fitted RFECV selector marked as selected.
feature_names=X_normalized.columns.values.tolist()
best_variables=[name for name,selected in zip(feature_names,rfe.support_) if selected]

print(best_variables)

['Ventas', 'Activo_fijo', 'Monto', 'Potencial', 'Macro_sector_', 'Macro_sector_1', 'Macro_sector_2', 'Macro_sector_3', 'Macro_sector_4', 'Macro_sector_5', 'Macro_sector_6', 'Macro_sector_7', 'Macro_sector_8', 'Sector_', 'Sector_0100008', 'Sector_0200006', 'Sector_0300004', 'Sector_0400002', 'Sector_1100007', 'Sector_1200005', 'Sector_1300003', 'Sector_1400001', 'Sector_2000008', 'Sector_2100006', 'Sector_2300002', 'Sector_2400000', 'Sector_2500008', 'Sector_2600006', 'Sector_2700004', 'Sector_2800002', 'Sector_2900000', 'Sector_3000007', 'Sector_3100005', 'Sector_3200003', 'Sector_3300001', 'Sector_3400009', 'Sector_3500007', 'Sector_3600005', 'Sector_3700003', 'Sector_3800001', 'Sector_3900009', 'Sector_4100004', 'Sector_4200002', 'Sector_5000005', 'Sector_6100002', 'Sector_6200000', 'Sector_6300008', 'Sector_6400006', 'Sector_6500004', 'Sector_6600002', 'Sector_6700000', 'Sector_6800001', 'Sector_6800002', 'Sector_6800003', 'Sector_6800008', 'Sector_6800010', 'Sector_6900003', 'Secto

## Guardar Modelo

In [398]:
import pickle

filename = 'credito2_knn_avar.sav'
# NOTE(review): the file name says "knn" but the object written is `forest`;
# confirm which one is intended before relying on this artifact.
# The context manager closes (and flushes) the file even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(forest, model_file)

## Cargar Modelo

In [437]:
import pickle

filename = 'credito1_forest_avar.sav'

# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files from a trusted source.
# The context manager closes the file even if unpickling fails.
with open(filename, 'rb') as model_file:
    model = pickle.load(model_file)

# NOTE(review): X_test / Y_test come from the split made in this session; if
# the pickled model was trained on a different split, these metrics may be
# affected by train/test overlap - verify.
score=model.score(X_test,Y_test)

predict=model.predict(X_test)

error = sqrt(mean_squared_error(Y_test,predict))

In [435]:
# Metrics of the loaded model on the hold-out split, followed by the first
# five (prediction, actual) pairs.
for metric_name,metric_value in (('Score',score),('Error',error)):
    print(metric_name,metric_value)
print(list(zip(predict,Y_test))[:5])

Score -0.6963398581210996
Error 45348599.95318206
[(46321416.10306225, 9070580.2), (6088117.202465617, 1000000.0), (2546637.5255748536, 5000000.0), (14022063.647354279, 7500000.0), (8945000.000000002, 25000000.0)]
