In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
%matplotlib inline
# Load the fruit measurements (pandas' read_table expects tab-separated text by default).
# NOTE(review): this is an absolute, machine-specific path; adjust it before running elsewhere.
DATA_PATH = 'F:/Data_Science/fruit_data_with_colors.txt'
df = pd.read_table(DATA_PATH)

In [2]:
# Preview the first five rows to check which columns were loaded.
df.head(n=5)

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [3]:
# Features: everything except the label/name columns.
# Target: the fruit_name column.
non_feature_columns = ['fruit_label', 'fruit_name', 'fruit_subtype']
X = df.drop(columns=non_feature_columns)
y = df.loc[:, 'fruit_name']

In [4]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

In [5]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [6]:
# Learn the scaling statistics from the training split only, then apply the
# same statistics to both splits.
# Gotcha: X_train / X_test are overwritten with their scaled versions, so
# re-running this cell on its own would refit the scaler on already-scaled data.
s = StandardScaler().fit(X_train)
X_train = s.transform(X_train)
X_test = s.transform(X_test)

In [8]:
#svmodel=SVC()
#svmodel.fit(X_train,y_train)
#svmodel.score(X_test,y_test)

In [9]:
from sklearn.model_selection import ShuffleSplit
# Baseline: mean accuracy of an untuned SVC over five seeded random 75/25 splits.
# NOTE(review): X here is the raw feature frame (only X_train / X_test were scaled),
# which may be why the score is low; confirm whether scaling was intended here.
cv = ShuffleSplit(n_splits=5, test_size=0.25, random_state=42)
svc_fold_scores = cross_val_score(SVC(), X, y, cv=cv)
svc_fold_scores.mean()

0.31999999999999995

In [17]:
def cross_val_score_model(model):
    """Print the mean shuffle-split cross-validation score of ``model``.

    The estimator is scored on the notebook-level ``X`` and ``y`` over five
    random splits (25% held out each time, seed 42). The mean of the
    per-split scores is printed next to the estimator's string form; nothing
    is returned.
    """
    splitter = ShuffleSplit(n_splits=5, test_size=0.25, random_state=42)
    fold_scores = cross_val_score(model, X, y, cv=splitter)
    cv_score = fold_scores.mean()
    print(f'CV_Score {model!s}: {cv_score!s}')

# Baseline comparison: score every candidate before any hyperparameter tuning.
baseline_models = [
    SVC(),
    RandomForestClassifier(),
    DecisionTreeClassifier(),
    LogisticRegression(solver='liblinear', multi_class='auto'),
]
for baseline_model in baseline_models:
    cross_val_score_model(baseline_model)

CV_Score SVC(): 0.31999999999999995
CV_Score RandomForestClassifier(): 0.9199999999999999
CV_Score DecisionTreeClassifier(): 0.8400000000000001
CV_Score LogisticRegression(solver='liblinear'): 0.7466666666666666


In [18]:
# Search space for hyperparameter tuning. Each entry pairs the estimator to
# tune ('model') with the candidate values for its hyperparameters ('params').
svm_space = {
    'model': SVC(),
    'params': {'C': [100, 400, 600, 800, 1000], 'kernel': ['rbf', 'linear']},
}
random_forest_space = {
    'model': RandomForestClassifier(),
    'params': {'n_estimators': [1, 5, 100, 1000], 'criterion': ['gini', 'entropy']},
}
decision_tree_space = {
    'model': DecisionTreeClassifier(),
    'params': {'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random']},
}
logistic_regression_space = {
    'model': LogisticRegression(solver='liblinear', multi_class='auto'),
    'params': {'C': [1, 5, 10]},
}

# Insertion order is kept so results are reported in this order.
model_params = {
    'svm': svm_space,
    'random_forest': random_forest_space,
    'decisionTree': decision_tree_space,
    'logistic regression': logistic_regression_space,
}


In [22]:
# Tune every candidate in ``model_params`` and collect the winners in one table.
#
# The grids in ``model_params`` hold 10, 8, 4 and 3 combinations, so a randomized
# search with its default ``n_iter=10`` ends up trying every combination anyway.
# An exhaustive GridSearchCV states that intent directly and does not depend on
# random sampling of candidates.
scores1 = []
cv = ShuffleSplit(n_splits=5, test_size=0.25, random_state=42)
for model_name, mp in model_params.items():
    search = GridSearchCV(mp['model'], mp['params'], cv=cv, return_train_score=False)
    # NOTE(review): the search runs on the raw X, not on the scaled X_train;
    # confirm that is intended for the scale-sensitive models.
    search.fit(X, y)
    scores1.append({
        'model': model_name,
        'best_score': search.best_score_,
        'best param': search.best_params_,
        'best estimator': search.best_estimator_,
    })

# One row per candidate: best mean CV score, its parameters, and the refit estimator.
ds = pd.DataFrame(scores1, columns=['model', 'best_score', 'best param', 'best estimator'])
ds



Unnamed: 0,model,best_score,best param,best estimator
0,svm,0.853333,"{'kernel': 'linear', 'C': 1000}","SVC(C=1000, kernel='linear')"
1,random_forest,0.96,"{'n_estimators': 100, 'criterion': 'gini'}","(DecisionTreeClassifier(max_features='auto', r..."
2,decisionTree,0.893333,"{'splitter': 'best', 'criterion': 'entropy'}",DecisionTreeClassifier(criterion='entropy')
3,logistic regression,0.813333,{'C': 5},"LogisticRegression(C=5, solver='liblinear')"


In [23]:
def display_text_max_col_width(df, width):
    """Print ``df`` with pandas' ``display.max_colwidth`` temporarily set to ``width``.

    The option is only overridden inside the context manager, so the previous
    setting is back in effect once the function returns. Nothing is returned.
    """
    colwidth_override = pd.option_context('display.max_colwidth', width)
    with colwidth_override:
        print(df)

# Show the winning hyperparameters with a column width limit of 800 characters.
display_text_max_col_width(ds["best param"], width=800)

0                 {'kernel': 'linear', 'C': 1000}
1      {'n_estimators': 100, 'criterion': 'gini'}
2    {'splitter': 'best', 'criterion': 'entropy'}
3                                        {'C': 5}
Name: best param, dtype: object


In [24]:
# Re-score each candidate using the hyperparameters selected by the search.
tuned_models = (
    SVC(kernel='linear', C=1000),
    RandomForestClassifier(n_estimators=100, criterion='gini'),
    DecisionTreeClassifier(criterion='entropy', splitter='best'),
    LogisticRegression(solver='liblinear', multi_class='auto', C=5),
)
for tuned_model in tuned_models:
    cross_val_score_model(tuned_model)

CV_Score SVC(C=1000, kernel='linear'): 0.8533333333333333
CV_Score RandomForestClassifier(): 0.9733333333333334
CV_Score DecisionTreeClassifier(criterion='entropy'): 0.8800000000000001
CV_Score LogisticRegression(C=5, solver='liblinear'): 0.8133333333333332


# Best model: Random Forest classifier with tuned hyperparameters

In [25]:
# Fit the tuned random forest on the scaled training split and report its
# accuracy on the held-out test split.
# A fixed random_state pins the forest's internal randomness (bootstrap samples
# and per-split feature sampling), so the score and any later prediction are
# the same on every run.
rf_clas_best = RandomForestClassifier(n_estimators=100, criterion='gini', random_state=42)
rf_clas_best.fit(X_train, y_train)
rf_clas_best.score(X_test, y_test)

0.9333333333333333

In [26]:
# Inspect a single row of the raw data by position.
sample_position = 55
df.iloc[sample_position]

fruit_label            4
fruit_name         lemon
fruit_subtype    unknown
mass                 116
width                6.3
height               7.7
color_score         0.72
Name: 55, dtype: object

In [27]:
# Predict the fruit for one set of measurements (mass, width, height, color_score).
# The values are wrapped in a DataFrame with the feature column names so the
# scaler receives the same labeled layout it was fitted on, not a bare list.
sample = pd.DataFrame([[116, 6.3, 7.7, 0.72]], columns=X.columns)
# Apply the scaling learned from the training split before predicting.
sample_scaled = s.transform(sample)
predicted_fruit = rf_clas_best.predict(sample_scaled)
predicted_fruit

array(['lemon'], dtype=object)