In [47]:
import pymysql
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [48]:
def create_dummies(dataset,columns):
    
    for column in columns:
        dummy=pd.get_dummies(dataset[column],prefix=column)

        dataset=dataset.drop(column,axis=1)#axis=1-->para eliminar columna y no fila

        dataset=pd.concat([dataset,dummy],axis=1)#axis=1-->para agregar columna y no fila

    return dataset

In [49]:
def normalize_columns(dataset,columns):
    
    scaler = MinMaxScaler()
    
    scaled_X = scaler.fit_transform(dataset[columns])
    
    X_normalized=pd.DataFrame(scaled_X,columns=columns)
    
    dataset=dataset.drop(columns,axis=1)#axis=1-->para eliminar columna y no fila
    
    new_dataset=pd.concat([dataset,X_normalized],axis=1)#axis=1-->para agregar columna y no fila
    return new_dataset

In [50]:
try:
    conn=pymysql.connect(host='localhost', user='root', passwd='', db='unifin')
    cur=conn.cursor()
    query="SELECT Id_cliente,Macro_sector,Sector,Subsector,Actividad,Ventas,Activo_fijo,Potencial,Cheques,Etapa,Producto from nbo_model;"
    cur.execute(query)
    res = cur.fetchall()
    cur.close()
    conn.close()
    
except pymysql.Error as e:
    msj= ("Error %d: %s" % (e.args[0], e.args[1]))
    print(msj)
else:
    headers=['id','Macro_sector','Sector','Subsector','Actividad','Ventas','Activo_fijo','Potencial','Cheques','Etapa','Producto']
    dataset_dummy={}
    filas=0

    for h in headers:
        dataset_dummy[h]=[]

    if(len(res)>0):
        for r in res:
            filas+=1
            for i in range(len(headers)):
                dataset_dummy[headers[i]].append(r[i])

    print('El numero de filas de este dataset es de:'+str(filas))

    df1=pd.DataFrame(dataset_dummy) 
    print(df1.shape)

    print(df1.dtypes)

El numero de filas de este dataset es de:30069
(30069, 11)
id               object
Macro_sector     object
Sector           object
Subsector        object
Actividad        object
Ventas          float64
Activo_fijo     float64
Potencial       float64
Cheques          object
Etapa            object
Producto         object
dtype: object


## Analisis de datos

In [51]:
df2=create_dummies(df1,['Etapa'])

In [52]:
Y=df2[['Etapa_R']]
X=df2[['Macro_sector','Sector','Subsector','Actividad','Ventas','Activo_fijo','Potencial']]
X.shape

(30069, 7)

In [53]:
#X['Cheques']=X['Cheques'].fillna(0)
indexes_empties=X[pd.isnull(X).any(axis=1)].index.tolist()
X=X.dropna(axis=0,how="any")
X= X.reset_index(drop=True)
Y=Y.drop(indexes_empties)
Y= Y.reset_index(drop=True)
X.shape

(30001, 7)

In [54]:
X_normalized=normalize_columns(X,['Ventas','Activo_fijo','Potencial'])
X_normalized.shape

(30001, 7)

In [55]:
#X_normalized=create_dummies(X_normalized,['Macro_sector','Sector','Subsector','Actividad'])
#X_normalized.shape

In [56]:
categoricas=['Macro_sector','Sector','Subsector','Actividad']
for c in categoricas:
    X_normalized[c], _ = pd.factorize(X_normalized[c])
X_normalized.shape

(30001, 7)

In [57]:
from sklearn.model_selection import train_test_split

In [58]:
X_train,X_test,Y_train,Y_test=train_test_split(X_normalized,Y.values.ravel(),test_size=.2)

## Regresion Logistica

In [59]:
from sklearn.linear_model import LogisticRegression

lr=LogisticRegression()

lr.fit(X_train,Y_train)

scorel=lr.score(X_test,Y_test)

predictl=lr.predict(X_test)

## Forest

In [60]:
%%time

from sklearn.ensemble import RandomForestClassifier

forest=RandomForestClassifier(n_jobs=2)

forest.fit(X_train,Y_train)

scoref=forest.score(X_test,Y_test)

predictf=forest.predict(X_test)

CPU times: user 401 ms, sys: 15.9 ms, total: 417 ms
Wall time: 428 ms


## SVM

In [61]:
%%time

from sklearn.svm import SVR
from sklearn.svm import SVC

C=1e3
svc_rbf = SVC(kernel="rbf", C=C, gamma=0.1) #kernel radial

model= svc_rbf.fit(X_train,Y_train)

scores=model.score(X_test,Y_test)

predicts=model.predict(X_test)

CPU times: user 12.2 s, sys: 63.4 ms, total: 12.2 s
Wall time: 12.2 s


## KNN

In [62]:
%%time

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

knn.fit(X_train, Y_train)

scorek=knn.score(X_test,Y_test)

predictknn=knn.predict(X_test)

#predictknn2=knn.predict_proba(X_test)

CPU times: user 334 ms, sys: 3.93 ms, total: 338 ms
Wall time: 236 ms


## Conclusion

In [63]:
contone=0
contzero=0
total=len(Y_test)

for i in Y_test:
    if(i==1):
        contone=contone+1
    else:
        contzero=contzero+1        

#----------------------------

print('>> LR.... <<')
print('Score',scorel)

c=0
cont=0
cont2=0
for a, b in zip(predictl, Y_test):
    if (a == b):
        c=c+1
        if(a==1):
            cont=cont+1
        else:
            cont2=cont2+1
print(c,'Registros acertados de',total )
print(cont,'"unos" de',contone,'-->',((cont*100)/contone),'%') 
print(cont2,'"zeros" de',contzero,'-->',((cont2*100)/contzero),'%')
print('..................')
print()
  
#----------------------------

print('>> Forest.... <<')
print('Score',scoref)

c=0
cont=0
cont2=0
for a, b in zip(predictf, Y_test):
    if (a == b):
        c=c+1
        if(a==1):
            cont=cont+1
        else:
            cont2=cont2+1
print(c,'Registros acertados de',total )
print(cont,'"unos" de',contone,'-->',((cont*100)/contone),'%') 
print(cont2,'"zeros" de',contzero,'-->',((cont2*100)/contzero),'%')
print('..................')
print()

#----------------------------

print('>> SVR.... <<')
print('Score',scores)

c=0
cont=0
cont2=0
for a, b in zip(predicts, Y_test):
    if (a == b):
        c=c+1
        if(a==1):
            cont=cont+1
        else:
            cont2=cont2+1
print(c,'Registros acertados de',total )
print(cont,'"unos" de',contone,'-->',((cont*100)/contone),'%') 
print(cont2,'"zeros" de',contzero,'-->',((cont2*100)/contzero),'%')
print('..................')
print()
  
#----------------------------

print('>> KNN.... <<')
print('Score',scorek)

c=0
cont=0
cont2=0
for a, b in zip(predictknn, Y_test):
    if (a == b):
        c=c+1
        if(a==1):
            cont=cont+1
        else:
            cont2=cont2+1
print(c,'Registros acertados de',total )
print(cont,'"unos" de',contone,'-->',((cont*100)/contone),'%') 
print(cont2,'"zeros" de',contzero,'-->',((cont2*100)/contzero),'%')
print('..................')
print()
  

>> LR.... <<
Score 0.845192467922013
5072 Registros acertados de 6001
0 "unos" de 929 --> 0.0 %
5072 "zeros" de 5072 --> 100.0 %
..................

>> Forest.... <<
Score 0.8410264955840693
5047 Registros acertados de 6001
8 "unos" de 929 --> 0.8611410118406889 %
5039 "zeros" de 5072 --> 99.3493690851735 %
..................

>> SVR.... <<
Score 0.8443592734544243
5067 Registros acertados de 6001
7 "unos" de 929 --> 0.7534983853606028 %
5060 "zeros" de 5072 --> 99.76340694006309 %
..................

>> KNN.... <<
Score 0.8325279120146643
4996 Registros acertados de 6001
139 "unos" de 929 --> 14.96232508073197 %
4857 "zeros" de 5072 --> 95.76104100946372 %
..................



## Obtener mejores variables

In [44]:
%%time

from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

lr=LogisticRegression()    
rfe=RFECV(lr)
rfe.fit(X_normalized,Y.values.ravel())

CPU times: user 2.61 s, sys: 112 ms, total: 2.72 s
Wall time: 1.46 s


In [45]:
best_variables=[]

for a, b in zip(X_normalized.columns.values.tolist(), rfe.support_):
    if (b==True):
        best_variables.append(a)
        
print(best_variables)        

['Macro_sector', 'Sector', 'Subsector', 'Actividad', 'Ventas', 'Activo_fijo', 'Potencial']


## Guardar Modelo

In [189]:
import pickle

filename = 'acreedor2.sav'
pickle.dump(knn, open(filename, 'wb'))

## Cargar Modelo

In [46]:
import pickle

filename = 'acreedor2_knn_avar.sav'

knn = pickle.load(open(filename, 'rb'))

scorek=knn.score(X_test,Y_test)

predictknn=knn.predict(X_test)


ValueError: query data dimension must match training data dimension

In [23]:
sss=knn.predict_proba(X_test)

In [22]:
predictknnctknn

array([0, 0, 0, ..., 0, 1, 0], dtype=uint8)

In [25]:
sss

array([[1. , 0. ],
       [0.6, 0.4],
       [1. , 0. ],
       ...,
       [0.8, 0.2],
       [0.2, 0.8],
       [1. , 0. ]])

In [24]:
sss[:,1]

array([0. , 0.4, 0. , ..., 0.2, 0.8, 0. ])

In [389]:
contone=0
contzero=0
total=len(Y_test)

for i in Y_test:
    if(i==1):
        contone=contone+1
    else:
        contzero=contzero+1        


#----------------------------

print('>> KNN.... <<')
print('Score',scorek)

c=0
cont=0
cont2=0
for a, b in zip(predictknn, Y_test):
    if (a == b):
        c=c+1
        if(a==1):
            cont=cont+1
        else:
            cont2=cont2+1
print(c,'Registros acertados de',total )
print(cont,'"unos" de',contone,'-->',((cont*100)/contone),'%') 
print(cont2,'"zeros" de',contzero,'-->',((cont2*100)/contzero),'%')
print('..................')
print()
  

>> KNN.... <<
Score 0.8360273287785369
5017 Registros acertados de 6001
145 "unos" de 924 --> 15.692640692640692 %
4872 "zeros" de 5077 --> 95.96218239117589 %
..................

