In [27]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFECV
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.compose import make_column_transformer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score, confusion_matrix, classification_report, accuracy_score
from sklearn.externals import joblib




In [2]:
description={}
file=open('description.txt','r')
for l in file:
    data=[str(d)for d in l.split(": ")]
    name=[str(n) for n in data[0].split(") ") ]
    descr=[str(des)for des in data[1].split("\n")]
    description[name[1]]=descr[0]
    
# as the given names are not significant at all, we use the description as names

names=[]
for key,value in description.items():
    names.append(value)
# exept for the target value which is too descriptive
names[-1]='experimental class'
names
data=pd.read_csv('biodeg.csv',names=names, delimiter=';')
df=data.copy()

In [3]:
def col_by_name(dataframe,name):
    res=[]
    for col in dataframe.columns:
        if name.lower() in col.lower():
            res.append(col)
    return res

def col_by_type(dataframe,dtype):
    res=[]
    for col in dataframe.select_dtypes(dtype):
            res.append(col)
    return res

In [4]:
trainset,testset=train_test_split(df,test_size=0.2, random_state=1)

In [5]:
def encryption(df):
    code={'RB':1,'NRB':0}
    for col in df.select_dtypes(include=['object']).columns:
        df.loc[:,col]=df[col].map(code)
    return df

def preprocessing(df):
    
    df=encryption(df)
    
    X=df.drop('experimental class', axis=1)
    y=df['experimental class']
    
    
    print(y.value_counts())
    return X,y

In [6]:
X_train, y_train=preprocessing(trainset)
X_test, y_test=preprocessing(testset)

0    567
1    277
Name: experimental class, dtype: int64
0    132
1     79
Name: experimental class, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [7]:
continuous_features=col_by_type(df,'float64')
continuous_pipeline=make_pipeline(StandardScaler())
transformer=make_column_transformer((continuous_pipeline,continuous_features))

In [8]:
selector = RFECV(SGDClassifier(random_state=0),
                step=1,
                min_features_to_select=10,
                cv=5)

In [9]:
preprocessor_other=make_pipeline(transformer, selector)

In [11]:
SVM=make_pipeline(preprocessor_other, SVC(random_state=0))
hyper_parameters_svm={'svc__C':range(1,10000), 'svc__gamma': [1e-6,1e-5,1e-4,1e-3,1e-2,1e-1,1]}
grid=RandomizedSearchCV(SVM,hyper_parameters_svm,scoring='accuracy',cv=4,n_iter=40)

grid.fit(X_train,y_train)
                  

RandomizedSearchCV(cv=4, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('pipeline',
                                              Pipeline(memory=None,
                                                       steps=[('columntransformer',
                                                               ColumnTransformer(n_jobs=None,
                                                                                 remainder='drop',
                                                                                 sparse_threshold=0.3,
                                                                                 transformer_weights=None,
                                                                                 transformers=[('pipeline',
                                                                                                Pipeline(memory=None,
                                                    

In [12]:
def best_theshold(precision, recall, threshold):
    res=0
    for i in range(len(precision)-1):
        if precision[i]==recall[i]:
            res=threshold[i]
    return res

In [13]:
precision, recall, threshold=precision_recall_curve(y_test, grid.best_estimator_.decision_function(X_test))
bt = bt=best_theshold(precision, recall, threshold)

In [17]:
def model_final(X):
    return grid.best_estimator_.decision_function(X)>bt

In [20]:
y_pred=model_final(X_test)
accuracy_score(y_test, y_pred)

0.8625592417061612

In [21]:
f1_score(y_test, y_pred)

0.8152866242038216

In [32]:
model=grid.best_estimator_
model.
model.fit(X_train, y_train)
model.score(X_test,y_test)

0.8578199052132701

In [33]:
joblib.dump(model, filename = 'model.sav')


['model.sav']