In [8]:
import pymysql
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [9]:
def create_dummies(dataset,columns):
    
    for column in columns:
        dummy=pd.get_dummies(dataset[column],prefix=column)

        dataset=dataset.drop(column,axis=1)#axis=1-->para eliminar columna y no fila

        dataset=pd.concat([dataset,dummy],axis=1)#axis=1-->para agregar columna y no fila

    return dataset

In [10]:
def normalize_columns(dataset,columns):
    
    scaler = MinMaxScaler()
    
    scaled_X = scaler.fit_transform(dataset[columns])
    
    X_normalized=pd.DataFrame(scaled_X,columns=columns)
    
    dataset=dataset.drop(columns,axis=1)#axis=1-->para eliminar columna y no fila
    
    new_dataset=pd.concat([dataset,X_normalized],axis=1)#axis=1-->para agregar columna y no fila
    return new_dataset

In [11]:
try:
    conn=pymysql.connect(host='localhost', user='root', passwd='', db='unifin')
    cur=conn.cursor()
    query="SELECT Id_cliente,Macro_sector,Sector,Subsector,Actividad,Ventas,Activo_fijo,Potencial,Cheques,Etapa,Producto from nbo_model;"
    cur.execute(query)
    res = cur.fetchall()
    cur.close()
    conn.close()
    
except pymysql.Error as e:
    msj= ("Error %d: %s" % (e.args[0], e.args[1]))
    print(msj)
else:
    headers=['id','Macro_sector','Sector','Subsector','Actividad','Ventas','Activo_fijo','Potencial','Cheques','Etapa','Producto']
    dataset_dummy={}
    filas=0

    for h in headers:
        dataset_dummy[h]=[]

    if(len(res)>0):
        for r in res:
            filas+=1
            for i in range(len(headers)):
                dataset_dummy[headers[i]].append(r[i])

    print('El numero de filas de este dataset es de:'+str(filas))

    df1=pd.DataFrame(dataset_dummy) 
    print(df1.shape)

    print(df1.dtypes)

El numero de filas de este dataset es de:30069
(30069, 11)
id               object
Macro_sector     object
Sector           object
Subsector        object
Actividad        object
Ventas          float64
Activo_fijo     float64
Potencial       float64
Cheques          object
Etapa            object
Producto         object
dtype: object


## Analisis de datos

In [12]:
df2=create_dummies(df1,['Etapa'])

In [13]:
Y=df2[['Etapa_R']]
X=df2[['Macro_sector','Sector','Subsector','Actividad','Ventas','Activo_fijo','Potencial']]
X.shape

(30069, 7)

In [14]:
#X['Cheques']=X['Cheques'].fillna(0)
indexes_empties=X[pd.isnull(X).any(axis=1)].index.tolist()
X=X.dropna(axis=0,how="any")
X= X.reset_index(drop=True)
Y=Y.drop(indexes_empties)
Y= Y.reset_index(drop=True)
X.shape

(30001, 7)

In [15]:
X_normalized=normalize_columns(X,['Ventas','Activo_fijo','Potencial'])
X_normalized.shape

(30001, 7)

In [16]:
X_normalized=create_dummies(X_normalized,['Macro_sector','Sector','Subsector','Actividad'])
X_normalized.shape

(30001, 1020)

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
X_train,X_test,Y_train,Y_test=train_test_split(X_normalized,Y.values.ravel(),test_size=.2)

In [19]:
df1['Etapa']

0        CL
1        CL
2         P
3        CL
4         P
5         R
6        CL
7        CL
8         R
9        CL
10       CL
11       CL
12        R
13        P
14       CL
15        P
16       CL
17       CL
18       CL
19       CL
20       CL
21       CL
22       CL
23       CL
24       CL
25       CL
26       CL
27        P
28       CL
29       CL
         ..
30039    CL
30040    CL
30041    CL
30042    CL
30043     P
30044     P
30045    CL
30046    CL
30047     R
30048    CL
30049    CL
30050    CL
30051    CL
30052    CL
30053     P
30054    CL
30055     R
30056    CL
30057    CL
30058    CL
30059     R
30060    CL
30061    CL
30062    CL
30063    CL
30064     P
30065    CL
30066    CL
30067    CL
30068    CL
Name: Etapa, Length: 30069, dtype: object

## Regresion Logistica

In [13]:
from sklearn.linear_model import LogisticRegression

lr=LogisticRegression()

lr.fit(X_train,Y_train)

scorel=lr.score(X_test,Y_test)

predictl=lr.predict(X_test)

## Forest

In [14]:
%%time

from sklearn.ensemble import RandomForestClassifier

forest=RandomForestClassifier(n_jobs=2)

forest.fit(X_train,Y_train)

scoref=forest.score(X_test,Y_test)

predictf=forest.predict(X_test)

  from numpy.core.umath_tests import inner1d


CPU times: user 3.16 s, sys: 112 ms, total: 3.27 s
Wall time: 1.8 s


## SVM

In [19]:
%%time

from sklearn.svm import SVR
from sklearn.svm import SVC

C=1e3
svc_rbf = SVC(kernel="rbf", C=C, gamma=0.1) #kernel radial

model= svc_rbf.fit(X_train,Y_train)

scores=model.score(X_test,Y_test)

predicts=model.predict(X_test)

CPU times: user 5min 30s, sys: 292 ms, total: 5min 31s
Wall time: 5min 31s


## KNN

In [16]:
%%time

from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()

knn.fit(X_train, Y_train)

scorek=knn.score(X_test,Y_test)

predictknn=knn.predict(X_test)

#predictknn2=knn.predict_proba(X_test)

CPU times: user 3min, sys: 168 ms, total: 3min
Wall time: 3min


## Conclusion

In [20]:
contone=0
contzero=0
total=len(Y_test)

for i in Y_test:
    if(i==1):
        contone=contone+1
    else:
        contzero=contzero+1        

#----------------------------

print('>> LR.... <<')
print('Score',scorel)

c=0
cont=0
cont2=0
for a, b in zip(predictl, Y_test):
    if (a == b):
        c=c+1
        if(a==1):
            cont=cont+1
        else:
            cont2=cont2+1
print(c,'Registros acertados de',total )
print(cont,'"unos" de',contone,'-->',((cont*100)/contone),'%') 
print(cont2,'"zeros" de',contzero,'-->',((cont2*100)/contzero),'%')
print('..................')
print()
  
#----------------------------

print('>> Forest.... <<')
print('Score',scoref)

c=0
cont=0
cont2=0
for a, b in zip(predictf, Y_test):
    if (a == b):
        c=c+1
        if(a==1):
            cont=cont+1
        else:
            cont2=cont2+1
print(c,'Registros acertados de',total )
print(cont,'"unos" de',contone,'-->',((cont*100)/contone),'%') 
print(cont2,'"zeros" de',contzero,'-->',((cont2*100)/contzero),'%')
print('..................')
print()

#----------------------------

print('>> SVR.... <<')
print('Score',scores)

c=0
cont=0
cont2=0
for a, b in zip(predicts, Y_test):
    if (a == b):
        c=c+1
        if(a==1):
            cont=cont+1
        else:
            cont2=cont2+1
print(c,'Registros acertados de',total )
print(cont,'"unos" de',contone,'-->',((cont*100)/contone),'%') 
print(cont2,'"zeros" de',contzero,'-->',((cont2*100)/contzero),'%')
print('..................')
print()
  
#----------------------------

print('>> KNN.... <<')
print('Score',scorek)

c=0
cont=0
cont2=0
for a, b in zip(predictknn, Y_test):
    if (a == b):
        c=c+1
        if(a==1):
            cont=cont+1
        else:
            cont2=cont2+1
print(c,'Registros acertados de',total )
print(cont,'"unos" de',contone,'-->',((cont*100)/contone),'%') 
print(cont2,'"zeros" de',contzero,'-->',((cont2*100)/contzero),'%')
print('..................')
print()
  

>> LR.... <<
Score 0.844525912347942
5068 Registros acertados de 6001
2 "unos" de 929 --> 0.21528525296017223 %
5066 "zeros" de 5072 --> 99.88170347003154 %
..................

>> Forest.... <<
Score 0.8413597733711048
5049 Registros acertados de 6001
8 "unos" de 929 --> 0.8611410118406889 %
5041 "zeros" de 5072 --> 99.38880126182966 %
..................

>> SVR.... <<
Score 0.8425262456257291
5056 Registros acertados de 6001
6 "unos" de 929 --> 0.6458557588805167 %
5050 "zeros" de 5072 --> 99.56624605678233 %
..................

>> KNN.... <<
Score 0.8298616897183803
4980 Registros acertados de 6001
141 "unos" de 929 --> 15.177610333692142 %
4839 "zeros" de 5072 --> 95.40615141955836 %
..................



## Obtener mejores variables

In [32]:
%%time

from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

lr=LogisticRegression()    
rfe=RFECV(lr)
rfe.fit(X_normalized,Y.values.ravel())

CPU times: user 16min 15s, sys: 2min 27s, total: 18min 42s
Wall time: 12min 42s


In [60]:
best_variables=[]

for a, b in zip(X_normalized.columns.values.tolist(), rfe.support_):
    if (b==True):
        best_variables.append(a)
        
print(best_variables)        

['Actividad_8123078', 'Actividad_8944098', 'Actividad_9411018']


## Guardar Modelo

In [189]:
import pickle

filename = 'acreedor2.sav'
pickle.dump(knn, open(filename, 'wb'))

## Cargar Modelo

In [20]:
import pickle

filename = 'acreedor2_knn_avar.sav'

knn = pickle.load(open(filename, 'rb'))

scorek=knn.score(X_test,Y_test)

predictknn=knn.predict(X_test)


In [23]:
sss=knn.predict_proba(X_test)

In [22]:
predictknnctknn

array([0, 0, 0, ..., 0, 1, 0], dtype=uint8)

In [25]:
sss

array([[1. , 0. ],
       [0.6, 0.4],
       [1. , 0. ],
       ...,
       [0.8, 0.2],
       [0.2, 0.8],
       [1. , 0. ]])

In [24]:
sss[:,1]

array([0. , 0.4, 0. , ..., 0.2, 0.8, 0. ])

In [389]:
contone=0
contzero=0
total=len(Y_test)

for i in Y_test:
    if(i==1):
        contone=contone+1
    else:
        contzero=contzero+1        


#----------------------------

print('>> KNN.... <<')
print('Score',scorek)

c=0
cont=0
cont2=0
for a, b in zip(predictknn, Y_test):
    if (a == b):
        c=c+1
        if(a==1):
            cont=cont+1
        else:
            cont2=cont2+1
print(c,'Registros acertados de',total )
print(cont,'"unos" de',contone,'-->',((cont*100)/contone),'%') 
print(cont2,'"zeros" de',contzero,'-->',((cont2*100)/contzero),'%')
print('..................')
print()
  

>> KNN.... <<
Score 0.8360273287785369
5017 Registros acertados de 6001
145 "unos" de 924 --> 15.692640692640692 %
4872 "zeros" de 5077 --> 95.96218239117589 %
..................

