In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
import time
from IPython.display import display

In [7]:
# Load the numeric movie table and build the feature / label frames used by
# the SVM cells below.
numericalDF = pd.read_csv('numericalData.csv')

# 'Unnamed: 0' is presumably the row index written out by an earlier
# DataFrame.to_csv call; it carries no information, so it is dropped.
numericalDF = numericalDF.drop(columns=['Unnamed: 0'])

target_col = 'class'
feature_cols = ['directores', 'criticas', 'duracion', 'genero',
                'actores1', 'actores2', 'actores3', 'rostros',
                'gross', 'budget']
# Kept with its original contents in case other cells reference it.
columnas = ['directores', 'criticas', 'duracion', 'genero',
            'actores1', 'actores2', 'actores3', 'rostros',
            'scores', 'gross', 'budget', 'class']

# Replace zeros with the column mean. The target column is deliberately
# skipped: overwriting a label with a (generally fractional) mean would
# turn it into a value that is not a real class.
# NOTE(review): the mean is computed over the whole column, zeros included,
# exactly as before; if zeros are missing-value markers, a mean over the
# non-zero entries may be what was intended -- confirm.
for col in columnas:
    if col == target_col:
        continue
    numericalDF.loc[numericalDF[col] == 0, col] = numericalDF[col].mean()

df = numericalDF
# Model inputs: every column except 'scores' and the label.
# NOTE(review): 'scores' is presumably excluded because the label is derived
# from it -- confirm.
df_pure = numericalDF[feature_cols]
# Single-column frame holding the label.
df_class = numericalDF[[target_col]]
numericalDF.head()

Unnamed: 0,directores,criticas,duracion,genero,actores1,actores2,actores3,rostros,scores,gross,budget,class
0,0.024778,0.003295,0.011543,0.007769,0.03402,0.029215,0.027567,0.007924,0.014223,1.6e-05,7.773335e-08,1.0
1,0.031655,0.001073,0.012825,0.011654,0.015347,0.000761,0.027559,0.028946,0.013576,2e-06,0.002522814,1.0
2,0.031642,0.000996,0.009747,0.018128,0.034004,0.029206,0.027551,0.007924,0.013576,0.007426,9.893336e-08,1.0
3,0.001505,0.003295,0.005515,0.022013,0.033988,0.029196,0.027543,0.005789,0.016162,0.007426,0.002522814,2.0
4,0.031629,7.7e-05,0.011158,0.011654,0.024448,0.029186,0.006778,0.011579,0.016593,0.007426,0.002522814,2.0


In [16]:
# Hold out 20% of the rows for testing. ravel() flattens the single-column
# label frame into the 1-D array the estimators are fitted on.
# random_state pins the shuffle so the split -- and therefore every metric
# printed by the cells below -- is the same on each Restart & Run All.
X_trainPID, X_testPID, y_trainPID, y_testPID = train_test_split(
    df_pure.values, df_class.values.ravel(), test_size=.2, random_state=42)

In [17]:
# Linear SVM with the default regularisation strength (C=1).
# random_state fixes the estimator's internal pseudo-random shuffling so that
# repeated runs on the same split produce the same model.
svmLineal = LinearSVC(random_state=42)

# perf_counter() is the clock intended for measuring short elapsed intervals;
# time.time() follows the system clock and can jump if that clock is adjusted.
start_time = time.perf_counter()
svmLineal.fit(X_trainPID, y_trainPID)
elapsed_time = time.perf_counter() - start_time

# Number of misclassified samples on the training split.
preds_train_Lineal = svmLineal.predict(X_trainPID)
fails_train_Lineal = np.sum(y_trainPID != preds_train_Lineal)

# Number of misclassified samples on the held-out test split.
preds_Lineal = svmLineal.predict(X_testPID)
fails_Lineal = np.sum(y_testPID != preds_Lineal)

# The report is built from adjacent string literals rather than a backslash
# continuation inside one literal, so the source indentation no longer leaks
# into the output as trailing spaces.
print(("SVM Lineal, C=1 (default)\n"
       "Puntos mal clasificados (entrenamiento): {} de {} ({}%)\n"
       "Puntos mal clasificados (prueba): {} de {} ({}%)\n"
       "Aciertos del {}%\nTiempo: {}\n")
      .format(fails_train_Lineal, len(y_trainPID), 100*fails_train_Lineal/len(y_trainPID),
              fails_Lineal, len(y_testPID), 100*fails_Lineal/len(y_testPID),
              svmLineal.score(X_testPID, y_testPID)*100, elapsed_time))

SVM Lineal, C=1 (default)
Puntos mal clasificados (entrenamiento): 1230 de 4028 (30.536246276067526%)       
Puntos mal clasificados (prueba): 326 de 1008 (32.34126984126984%)       
Aciertos del 67.65873015873017%
Tiempo: 0.02552962303161621



In [25]:
# RBF-kernel SVM with C=1.0 and the library's default gamma.
svmRbf = SVC(kernel='rbf', C=1.0)

# perf_counter() is the clock intended for measuring short elapsed intervals;
# time.time() follows the system clock and can jump if that clock is adjusted.
start_time = time.perf_counter()
svmRbf.fit(X_trainPID, y_trainPID)
elapsed_time = time.perf_counter() - start_time

# Number of misclassified samples on the training split.
preds_train_Rbf = svmRbf.predict(X_trainPID)
fails_train_Rbf = np.sum(y_trainPID != preds_train_Rbf)

# Number of misclassified samples on the held-out test split.
preds_Rbf = svmRbf.predict(X_testPID)
fails_Rbf = np.sum(y_testPID != preds_Rbf)

# The report is built from adjacent string literals rather than a backslash
# continuation inside one literal, so the source indentation no longer leaks
# into the output as trailing spaces.
print(("SVM RBF, C=1.0\n"
       "Puntos mal clasificados (entrenamiento): {} de {} ({}%)\n"
       "Puntos mal clasificados (prueba): {} de {} ({}%)\n"
       "Aciertos del {}%\nTiempo: {}\n")
      .format(fails_train_Rbf, len(y_trainPID), 100*fails_train_Rbf/len(y_trainPID),
              fails_Rbf, len(y_testPID), 100*fails_Rbf/len(y_testPID),
              svmRbf.score(X_testPID, y_testPID)*100, elapsed_time))

SVM RBF, C=1.0
Puntos mal clasificados (entrenamiento): 1257 de 4028 (31.206554121151935%)       
Puntos mal clasificados (prueba): 333 de 1008 (33.035714285714285%)       
Aciertos del 66.96428571428571%
Tiempo: 0.5609030723571777



In [26]:
# RBF-kernel SVM with a much weaker regularisation (C=500) and gamma set
# explicitly to 0.1. This rebinds svmRbf / preds_*_Rbf / fails_*_Rbf from the
# C=1.0 cell, so those names refer to this model from here on.
svmRbf = SVC(kernel='rbf', C=500, gamma=0.1)

# perf_counter() is the clock intended for measuring short elapsed intervals;
# time.time() follows the system clock and can jump if that clock is adjusted.
start_time = time.perf_counter()
svmRbf.fit(X_trainPID, y_trainPID)
elapsed_time = time.perf_counter() - start_time

# Number of misclassified samples on the training split.
preds_train_Rbf = svmRbf.predict(X_trainPID)
fails_train_Rbf = np.sum(y_trainPID != preds_train_Rbf)

# Number of misclassified samples on the held-out test split.
preds_Rbf = svmRbf.predict(X_testPID)
fails_Rbf = np.sum(y_testPID != preds_Rbf)

# The report is built from adjacent string literals rather than a backslash
# continuation inside one literal, so the source indentation no longer leaks
# into the output as trailing spaces.
print(("SVM RBF, C=500\n"
       "Puntos mal clasificados (entrenamiento): {} de {} ({}%)\n"
       "Puntos mal clasificados (prueba): {} de {} ({}%)\n"
       "Aciertos del {}%\nTiempo: {}\n")
      .format(fails_train_Rbf, len(y_trainPID), 100*fails_train_Rbf/len(y_trainPID),
              fails_Rbf, len(y_testPID), 100*fails_Rbf/len(y_testPID),
              svmRbf.score(X_testPID, y_testPID)*100, elapsed_time))


SVM RBF, C=500
Puntos mal clasificados (entrenamiento): 1213 de 4028 (30.114200595829196%)       
Puntos mal clasificados (prueba): 320 de 1008 (31.746031746031747%)       
Aciertos del 68.25396825396825%
Tiempo: 0.6389672756195068



In [28]:
# Sigmoid-kernel SVM with the library defaults (C=1.0).
# NOTE(review): the recorded run of this cell reported exactly the same error
# counts as the RBF C=1.0 cell (1257 train / 333 test), which may mean both
# models collapse to predicting a single class -- worth checking, e.g. by
# inspecting np.unique(preds_Sgm) or standardising the features first.
svmSgm = SVC(kernel='sigmoid')

# perf_counter() is the clock intended for measuring short elapsed intervals;
# time.time() follows the system clock and can jump if that clock is adjusted.
start_time = time.perf_counter()
svmSgm.fit(X_trainPID, y_trainPID)
elapsed_time = time.perf_counter() - start_time

# Number of misclassified samples on the training split.
preds_train_Sgm = svmSgm.predict(X_trainPID)
fails_train_Sgm = np.sum(y_trainPID != preds_train_Sgm)

# Number of misclassified samples on the held-out test split.
preds_Sgm = svmSgm.predict(X_testPID)
fails_Sgm = np.sum(y_testPID != preds_Sgm)

# The heading no longer starts with "Diabetes": that was a leftover from a
# different dataset and mislabelled these results. The report is also built
# from adjacent string literals rather than a backslash continuation inside
# one literal, so source indentation no longer leaks into the output.
print(("SVM Sigmoide, C=1.0\n"
       "Puntos mal clasificados (entrenamiento): {} de {} ({}%)\n"
       "Puntos mal clasificados (prueba): {} de {} ({}%)\n"
       "Aciertos del {}%\nTiempo: {}\n")
      .format(fails_train_Sgm, len(y_trainPID), 100*fails_train_Sgm/len(y_trainPID),
              fails_Sgm, len(y_testPID), 100*fails_Sgm/len(y_testPID),
              svmSgm.score(X_testPID, y_testPID)*100, elapsed_time))

Diabetes
SVM Sigmoide, C=1.0
Puntos mal clasificados (entrenamiento): 1257 de 4028 (31.206554121151935%)       
Puntos mal clasificados (prueba): 333 de 1008 (33.035714285714285%)       
Aciertos del 66.96428571428571%
Tiempo: 1.2904388904571533

