In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Read the diabetes CSV directly from GitHub (needs network access) and
# display the resulting frame as the cell output.
url = 'https://raw.githubusercontent.com/61050960/Dataset/main/pima-indians-diabetes1.csv?raw=true'
df = pd.read_csv(url)
df

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years),Class variable (0 or 1)
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
# The last column of `df` is the class label; every column before it is
# treated as a feature.
label = df.iloc[:, -1]
data = df.iloc[:, :-1]
data

Unnamed: 0,Number of times pregnant,Plasma glucose concentration a 2 hours in an oral glucose tolerance test,Diastolic blood pressure (mm Hg),Triceps skin fold thickness (mm),2-Hour serum insulin (mu U/ml),Body mass index (weight in kg/(height in m)^2),Diabetes pedigree function,Age (years)
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63
764,2,122,70,27,0,36.8,0.340,27
765,5,121,72,23,112,26.2,0.245,30
766,1,126,60,0,0,30.1,0.349,47


In [7]:
# Split into train and test sets: 30% of the rows are held out for testing,
# and the fixed random_state keeps the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    label,
    test_size=0.30,
    random_state=101,
)

In [8]:
# Random forest baseline trained on all features.
from sklearn.ensemble import RandomForestClassifier

# random_state pins the forest's bootstrap/feature sampling so this baseline
# (and every later refit of `clf`) gives the same result on each run.
clf = RandomForestClassifier(n_estimators=10, random_state=101)
clf_ = clf.fit(X_train, y_train)  # fit() returns the estimator itself
predict = clf_.predict(X_test)
print("Accuracy = "+ str(accuracy_score(y_test,predict)))

Accuracy = 0.7489177489177489


In [9]:
def initilization_of_population(size,n_feat,drop_fraction=0.3):
    """Build the initial population of random feature masks.

    Each chromosome is a boolean NumPy array of length ``n_feat`` in which
    ``int(drop_fraction * n_feat)`` entries are ``False`` and the rest are
    ``True``; the positions are randomised with ``np.random.shuffle``.

    Parameters
    ----------
    size : int
        Number of chromosomes to create.
    n_feat : int
        Length of each chromosome.
    drop_fraction : float, default 0.3
        Fraction of genes initialised to ``False``.

    Returns
    -------
    list of numpy.ndarray
        ``size`` independent boolean arrays.
    """
    n_dropped = int(drop_fraction * n_feat)
    population = []
    for _ in range(size):
        # Builtin `bool` instead of the `np.bool` alias, which was removed
        # in NumPy 1.24 and raises AttributeError there.
        cs = np.ones(n_feat, dtype=bool)
        cs[:n_dropped] = False
        np.random.shuffle(cs)
        population.append(cs)
    return population

In [10]:
def selection(pop_after_fit,n_parents):
    """Return the first ``n_parents`` entries of ``pop_after_fit`` as a new list.

    No ranking happens here: the entries are taken in the order given, so the
    caller must pass the population already ordered. Raises ``IndexError``
    when ``n_parents`` exceeds ``len(pop_after_fit)``.
    """
    return [pop_after_fit[rank] for rank in range(n_parents)]

In [11]:
def crossover(pop_after_sel):
    """Produce one child per parent and return parents followed by children.

    Child ``i`` is a copy of parent ``i`` whose genes ``[3:7]`` are replaced
    by the same slice of parent ``(i + 1) % len(pop_after_sel)``.

    The input list and its chromosomes are left untouched. All returned
    chromosomes are fresh copies, so later in-place edits cannot leak back
    into the caller's population.

    Parameters
    ----------
    pop_after_sel : list
        Selected parent chromosomes (sequences supporting ``.copy()`` and
        slice assignment, e.g. NumPy boolean arrays).

    Returns
    -------
    list
        ``2 * len(pop_after_sel)`` chromosomes: the parents, then the children.
    """
    n_parents = len(pop_after_sel)
    # Copy the parents so the result shares no objects with the input.
    parents = [chromosome.copy() for chromosome in pop_after_sel]
    children = []
    for i in range(n_parents):
        child = parents[i].copy()
        # The donor is read from the unmodified parents, never from a child.
        child[3:7] = parents[(i + 1) % n_parents][3:7]
        children.append(child)
    return parents + children

In [12]:
def mutation(pop_after_cross,mutation_rate):
    """Flip each gene independently with probability ``mutation_rate``.

    Every chromosome is copied before mutation, so the chromosomes in
    ``pop_after_cross`` (and any other references to them) are not modified.

    Parameters
    ----------
    pop_after_cross : list
        Chromosomes to mutate (sequences supporting ``.copy()`` and item
        assignment, e.g. NumPy boolean arrays).
    mutation_rate : float
        Per-gene flip probability; each gene is flipped when
        ``random.random() < mutation_rate``.

    Returns
    -------
    list
        New chromosomes, one per input chromosome, in the same order.
    """
    population_nextgen = []
    for chromosome in pop_after_cross:
        mutated = chromosome.copy()
        for j in range(len(mutated)):
            if random.random() < mutation_rate:
                mutated[j] = not mutated[j]
        population_nextgen.append(mutated)
    return population_nextgen

In [13]:
def fitness_score(population):
    """Score every chromosome and return scores and chromosomes, best first.

    For each boolean mask in ``population`` the module-level ``clf`` is refit
    on the masked columns of the global ``X_train`` and evaluated with
    ``accuracy_score`` on the masked columns of the global ``X_test``.

    NOTE(review): fitness is measured on the test split, so any accuracy later
    reported on that same split is optimistically biased.

    Returns
    -------
    (list, list)
        Accuracies in descending order, and the matching chromosomes (rows of
        a NumPy array) in the same order.
    """
    accuracies = []
    for mask in population:
        # Refit the shared classifier on only the columns this mask keeps.
        clf.fit(X_train.iloc[:, mask], y_train)
        held_out_pred = clf.predict(X_test.iloc[:, mask])
        accuracies.append(accuracy_score(y_test, held_out_pred))

    score_arr = np.array(accuracies)
    pop_arr = np.array(population)
    ascending = np.argsort(score_arr)
    # argsort is ascending; reverse after indexing to put the best first.
    ranked_scores = score_arr[ascending][::-1]
    ranked_pop = pop_arr[ascending, :][::-1]
    return list(ranked_scores), list(ranked_pop)

In [15]:
def generations(size,n_feat,n_parents,mutation_rate,n_gen,X_train,X_test, y_train, y_test):
    """Run the genetic algorithm and record the best chromosome per generation.

    Each generation is scored with ``fitness_score``, truncated to the first
    ``n_parents`` by ``selection``, extended by ``crossover`` and perturbed by
    ``mutation``.

    Parameters
    ----------
    size : int
        Size of the initial population.
    n_feat : int
        Chromosome length (number of features).
    n_parents : int
        Number of top chromosomes kept each generation.
    mutation_rate : float
        Per-gene flip probability passed to ``mutation``.
    n_gen : int
        Number of generations to run.
    X_train, X_test, y_train, y_test
        Accepted for interface compatibility but not used in this function:
        ``fitness_score`` is called with the population only.

    Returns
    -------
    (list, list)
        ``best_c`` holds the top-scoring chromosome of each generation and
        ``best_s`` its score, both in generation order.
    """
    best_c = []
    best_s = []
    population_nextgen = initilization_of_population(size, n_feat)
    for _ in range(n_gen):
        scores, pop_after_fit = fitness_score(population_nextgen)
        # Snapshot the winner before the later stages run: they may edit
        # chromosomes in place, which would decouple best_c from best_s.
        best_c.append(pop_after_fit[0].copy())
        best_s.append(scores[0])
        pop_after_sel = selection(pop_after_fit, n_parents)
        pop_after_cross = crossover(pop_after_sel)
        population_nextgen = mutation(pop_after_cross, mutation_rate)
    return best_c, best_s

In [16]:
# Run the genetic algorithm, then refit the classifier on the feature mask
# recorded for the final generation and report its test accuracy.
# NOTE(review): chromo[-1] is the last generation's best mask, which is not
# necessarily the mask with the highest entry in `score`.
chromo, score = generations(
    size=200,
    n_feat=8,
    n_parents=100,
    mutation_rate=0.01,
    n_gen=38,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
clf.fit(X_train.iloc[:, chromo[-1]], y_train)
predict = clf.predict(X_test.iloc[:, chromo[-1]])
print(f"Accuracy score after genetic algorithm is= {accuracy_score(y_test, predict)}")

Accuracy score after genetic algorithm is= 0.7662337662337663
