In [None]:
# Mount Google Drive at /content/drive; the CSV and JSON paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import random
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import nltk
# Fetch the NLTK stop-word corpus before it is imported on the next line.
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
import re
import json

# English stop words as a set, used for membership tests when cleaning text.
# NOTE(review): `random` is used below without a visible random.seed(...) call,
# so results presumably differ between runs - confirm whether that is intended.
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Read Data

In [None]:
# Read the three CSV datasets and encode each one's label column as 0/1.
dataset1 = pd.read_csv('/content/drive/MyDrive/GA_algorithm/data/samples.csv')
dataset1['sentiment'] = dataset1['sentiment'].replace({'negative': 0, 'positive': 1})

dataset2 = pd.read_csv("/content/drive/MyDrive/GA_algorithm/data/true_false.csv")
dataset2['class'] = dataset2['class'].replace({'fake': 0, 'true': 1})

# Third file: drop the 'Unnamed: 0' column and order the columns as (text, class).
dataset3 = (
    pd.read_csv("/content/drive/MyDrive/GA_algorithm/data/amazon_150.csv")
    .drop(columns='Unnamed: 0')
    .reindex(columns=['text', 'class'])
)
dataset3['class'] = dataset3['class'].replace({'__label__1': 0, '__label__2': 1})

# Same order as the files were read; later cells index this list by position.
data_list = [dataset1, dataset2, dataset3]

# Cleaning Data

In [None]:

# Normalise every dataset to the columns ('text', 'label') and clean the text.
# Translation table that deletes every ASCII punctuation character (built once).
punctuation_table = str.maketrans('', '', string.punctuation)

for i, data in enumerate(data_list):
  # The first column becomes 'text' and the second 'label', whatever they were called.
  data_list[i] = data.rename(columns={data.columns[0]: 'text', data.columns[1]: 'label'})
  # Replace HTML line breaks with a space so the neighbouring words are not glued together.
  data_list[i]['text'] = data_list[i]['text'].replace(r'<br />', ' ', regex=True)
  # Remove parenthesised spans. This must happen BEFORE punctuation is stripped
  # (otherwise no parentheses are left to match) and needs regex=True explicitly,
  # because recent pandas versions treat the pattern as a literal string by default.
  data_list[i]['text'] = data_list[i]['text'].str.replace(r'\([^)]*\)', '', regex=True)
  # Drop stop words (case-insensitive); split/join also collapses whitespace runs.
  data_list[i]['text'] = data_list[i]['text'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
  # Finally delete the remaining punctuation characters.
  data_list[i]['text'] = data_list[i]['text'].apply(lambda x: x.translate(punctuation_table))

data_list[0]

  data_list[i]['text'] = data_list[i]['text'].str.replace(r'\([^)]*\)', '')
  data_list[i]['text'] = data_list[i]['text'].str.replace(r'\([^)]*\)', '')
  data_list[i]['text'] = data_list[i]['text'].str.replace(r'\([^)]*\)', '')


Unnamed: 0,text,label
0,first film proposed series achieves right bala...,1
1,Hoot nice young persons film group middle scho...,1
2,wonderful movie struggle Mormons final settlem...,1
3,A Slight Case Murder excellent TV movie defian...,1
4,hardly movie all rather real vaudeville show f...,1
...,...,...
145,luminously photographed unusually wellwritten ...,1
146,film wonderful romp intelligent playful myster...,1
147,WOW movie horrible Im glad pay money see horri...,0
148,viewed first two nights coming IMDb looking ac...,0


In [None]:
data_list[1]

Unnamed: 0,text,label
0,Honduran opposition candidate Nasralla says I ...,1
1,Egypts Sisi discusses repercussions US embassy...,1
2,Iraqi government asks Kurdistan regional gover...,1
3,Senate intelligence panel seeks details White ...,1
4,Erdogan Pope say phone call attempts change Je...,1
...,...,...
145,OOPS DOCUMENTS SHOW Roy Moore Represented Accu...,0
146,BREAKING Video BLACK SUPREMACY TERROR GROUP PL...,0
147,NFL LEGEND Supported Hillary Leaves CNN Host S...,0
148,NFL LEGEND Supported Hillary Leaves CNN Host S...,0


# Create Individuals

In [None]:
# Class for the individuals of the genetic algorithm
class Individual:
    """One candidate solution: a word list per label plus a fitness score.

    Attributes:
        label1_word_list: words that count as evidence for label 1.
        label0_word_list: words that count as evidence for label 0.
        fitness: score assigned by the fitness function (0 until evaluated).

    All constructor arguments are optional so that the existing
    ``Individual()`` call sites (which fill the lists in afterwards) keep
    working.
    """

    def __init__(self, label1_word_list=None, label0_word_list=None, fitness=0):
        # A fresh list per instance; never share a mutable default between individuals.
        self.label0_word_list = [] if label0_word_list is None else label0_word_list
        self.label1_word_list = [] if label1_word_list is None else label1_word_list
        self.fitness = fitness

    # The initialiser was originally spelled `init`, so Python never called it.
    # The old name is kept as an alias for any code that invoked it explicitly.
    init = __init__

def create_word_pool(data):
    """Collect every word occurrence in ``data['text']`` into one flat list.

    Words are split on any whitespace run, the same tokenisation that
    ``fitness_function`` uses. Splitting on a single space instead would put
    empty strings into the pool whenever a text contains repeated spaces, and
    such tokens can never match a document word.

    Args:
        data: DataFrame with a string column named 'text'.

    Returns:
        List of words in row order; duplicates are kept, so frequent words are
        proportionally more likely to be sampled from the pool.
    """
    word_pool = []
    for text in data['text']:
        word_pool.extend(text.split())
    return word_pool


def create_population(population_size, list_length, word_pool):
    """Build ``population_size`` individuals with randomly sampled word lists.

    Each individual gets two lists of ``list_length`` words drawn without
    replacement (``random.sample``) from ``word_pool``: first the label-1
    list, then the label-0 list.
    """
    def _random_individual():
        # Sample label 1 before label 0 so the random stream is consumed in a fixed order.
        words_for_1 = random.sample(word_pool, list_length)
        words_for_0 = random.sample(word_pool, list_length)
        member = Individual()
        member.label1_word_list = words_for_1
        member.label0_word_list = words_for_0
        return member

    return [_random_individual() for _ in range(population_size)]

def select_parents(population):
    """Pick two parents with probability proportional to list position.

    The element at 0-based position ``i`` gets weight ``(i + 1) / (n(n+1)/2)``,
    so later positions are favoured; the callers in this notebook pass the
    population sorted by ascending fitness. Selection is with replacement
    (``random.choices``), so the same individual can be returned twice.
    """
    size = len(population)
    rank_total = size * (size + 1) // 2
    weights = [rank / rank_total for rank in range(1, size + 1)]
    return random.choices(population, weights, k=2)

def crossover(p1, p2):
    """Create two children by one-point crossover of two parents.

    Each child takes the head of ``p1``'s lists and the tail of ``p2``'s lists.
    Child 1 is cut at ``cross_point1`` and child 2 at ``cross_point2``; within
    a child the same cut position is applied to both word lists.

    Args:
        p1, p2: parents exposing ``label1_word_list`` and ``label0_word_list``.

    Returns:
        Tuple ``(c1, c2)`` of two distinct ``Individual`` objects. The parents
        are not modified (slicing copies the lists).
    """
    cross_point1 = random.randint(0, len(p1.label1_word_list))
    cross_point2 = random.randint(0, len(p1.label0_word_list))

    # Two separate instances. Chained assignment (c1 = c2 = Individual()) would
    # bind both names to one object, so the second child would overwrite the
    # first and the same individual would be returned (and later mutated) twice.
    c1 = Individual()
    c2 = Individual()

    c1.label1_word_list = p1.label1_word_list[:cross_point1] + p2.label1_word_list[cross_point1:]
    c1.label0_word_list = p1.label0_word_list[:cross_point1] + p2.label0_word_list[cross_point1:]

    c2.label1_word_list = p1.label1_word_list[:cross_point2] + p2.label1_word_list[cross_point2:]
    c2.label0_word_list = p1.label0_word_list[:cross_point2] + p2.label0_word_list[cross_point2:]
    return c1, c2

def mutation(word_pool, c, mutation_rate):
    """Randomly replace words of individual ``c`` in place and return it.

    For every position of the label-1 list one random number is drawn; when it
    is below ``mutation_rate`` the word at that position is replaced in BOTH
    lists (label 1 first, then label 0) by a random word from ``word_pool``.
    """
    for position, _ in enumerate(c.label1_word_list):
        if not (random.random() < mutation_rate):
            continue
        c.label1_word_list[position] = random.choice(word_pool)
        c.label0_word_list[position] = random.choice(word_pool)
    return c

def fitness_function(individual, data):
    """Return the fraction of rows in ``data`` that ``individual`` classifies correctly.

    A row is predicted as label 1 when more of its distinct words appear in the
    individual's label-1 list than in its label-0 list, and as label 0 in the
    opposite case. A word present in both lists counts for label 1 only.
    Ties are decided by a coin flip, so repeated calls on the same individual
    can return different values.

    Args:
        individual: object with ``label1_word_list`` and ``label0_word_list``.
        data: DataFrame with a string column 'text' and a 0/1 column 'label'.

    Returns:
        Accuracy in [0, 1]; 0.0 for an empty ``data`` (instead of dividing by zero).
    """
    total_rows = len(data)
    if total_rows == 0:
        return 0.0

    # Sets make each membership test O(1); the word lists hold hundreds of entries.
    label1_words = set(individual.label1_word_list)
    label0_words = set(individual.label0_word_list)

    correct_classifications = 0
    for text, label in zip(data['text'], data['label']):
        words = set(text.split())
        label1_count = len(words & label1_words)
        # Words already counted for label 1 are excluded here (if/elif semantics).
        label0_count = len((words & label0_words) - label1_words)
        if label1_count > label0_count:
            if label == 1:
                correct_classifications += 1
        elif label0_count > label1_count:
            if label == 0:
                correct_classifications += 1
        else:
            # Tie: guess at random, correct with probability 1/2.
            correct_classifications += random.randrange(0, 2, 1)
    return correct_classifications / total_rows

def genetic_algorithm(dataset, population, word_pool, mutation_rate, num_iterations):
    """Evolve ``population`` for ``num_iterations`` generations.

    Every generation selects parent pairs, produces two mutated children per
    pair, re-scores the selected parents and the children on ``dataset`` and
    keeps the best ``len(population)`` of them as the next population.

    Args:
        dataset: DataFrame passed to ``fitness_function``.
        population: non-empty list of individuals, sorted by ascending fitness
            (``select_parents`` weights later positions more heavily).
        word_pool: words that mutation may insert.
        mutation_rate: per-position mutation probability.
        num_iterations: number of generations to run.

    Returns:
        Tuple ``(best_individual, hist)`` where ``hist['avg']`` and
        ``hist['best']`` hold one fitness value per generation.

    Raises:
        ValueError: if ``population`` is empty.
    """
    pop_size = len(population)
    if pop_size == 0:
        raise ValueError('population must not be empty')
    # Round up so an odd-sized population still yields at least pop_size
    # candidates; with pop_size // 2 pairs the population lost two members
    # per generation whenever its size was odd.
    num_pairs = (pop_size + 1) // 2

    hist = {}
    hist['avg'] = []
    hist['best'] = []
    for iteration in range(num_iterations):

        offspring = []
        parents = []

        for _ in range(num_pairs):
          p1, p2 = select_parents(population)
          c1, c2 = crossover(p1, p2)
          c1 = mutation(word_pool, c1, mutation_rate)
          c2 = mutation(word_pool, c2, mutation_rate)

          parents.extend([p1, p2])
          offspring.extend([c1, c2])

        generation = parents + offspring
        for individual in generation:
            individual.fitness = fitness_function(individual, dataset)

        # Ascending sort: the best individuals end up at the tail of the list.
        generation = sorted(generation,key=lambda x: x.fitness)

        # Keep exactly pop_size survivors, taken from the best end.
        population = generation[-pop_size:]
        fitness_values = [ind.fitness for ind in population]
        avg = sum(fitness_values) / len(fitness_values)
        hist['avg'].append(avg)
        hist['best'].append(population[-1].fitness)

        print(f'iteration {iteration+1}, best fitness: {population[-1].fitness}.')
    # return the fittest individual in the final population
    return max(population, key=lambda x: x.fitness), hist


In [None]:

# Grid search over population size, word-list length and mutation rate on the
# first dataset. `data` is bound explicitly here: previously this cell relied on
# whatever `data` happened to hold from an earlier cell (after the cleaning
# loop that is the un-renamed third dataset, not the one the word pool is built from).
data = data_list[0]
word_pool = create_word_pool(data)

pop_sizes = [20,60,100]
list_lengths = [100, 300, 500]
mutation_rates = [.05, .1, .2]

# NOTE(review): hist_array is only kept in memory here, while a later cell loads
# hist.json from Drive - confirm where that file gets written.
hist_array = []
for size in pop_sizes:
  for length in list_lengths:
    for rate in mutation_rates:

      # Score the random initial population and order it worst-to-best,
      # which is the order the parent selection expects.
      population = create_population(size, length, word_pool)
      for ind in population:
        ind.fitness = fitness_function(ind, data)

      population = sorted(population,key=lambda x: x.fitness)
      print(f'best fitness in initialization {population[-1].fitness}.')
      best_ind, hist = genetic_algorithm(data, population, word_pool, mutation_rate = rate, num_iterations = 100)
      # Tag the history with its hyper-parameters so the plots can filter on them.
      hist['pop_size'] = size
      hist['list_length'] = length
      hist['mutation_rate'] = rate
      hist_array.append(hist)

In [None]:
# Read the run histories from Drive (presumably the saved grid-search output;
# the plots below expect 'pop_size', 'list_length', 'mutation_rate', 'avg' and 'best' keys).
file_path = '/content/drive/MyDrive/GA_algorithm/hist.json'
with open(file_path, 'r') as hist_file:
    hist_array = json.load(hist_file)

In [None]:
pop_sizes = [20,60,100]
list_lengths = [100, 300, 500]
mutation_rates = [.05, .1, .2]

# One figure per population size, with an average/best pair of traces per mutation rate.
for pop_size in pop_sizes:
  # Runs recorded for the current population size
  filtered_data = [d for d in hist_array if d['pop_size'] == pop_size]

  # Unique mutation rates present among those runs
  mutation_rates = sorted(set([d['mutation_rate'] for d in filtered_data]))

  fig = go.Figure()
  for mutation_rate in mutation_rates:
      # Runs for the current mutation rate
      filtered_data2 = [d for d in filtered_data if d['mutation_rate'] == mutation_rate]

      # Only the first matching run is plotted; other runs with the same
      # population size and mutation rate are not shown in this figure.
      avg_values = filtered_data2[0]['avg']
      best_values = filtered_data2[0]['best']

      # Traces for the average and best fitness per generation
      fig.add_trace(go.Scatter(x=list(range(len(avg_values))),
                                y=avg_values,
                                mode='lines',
                                name='Average values (mutation rate: {})'.format(mutation_rate)))
      fig.add_trace(go.Scatter(x=list(range(len(best_values))),
                                y=best_values,
                                mode='lines',
                                name='Best values (mutation rate: {})'.format(mutation_rate)))

  # The layout does not depend on the mutation rate, so apply it once per figure.
  fig.update_layout(title_text=f"Best and Average Performance for Population Size {pop_size} Across Varying Mutation Rates",
                    xaxis=dict(title='Number of Generation'),
                    yaxis=dict(title='Fitness'),
                    width=800,   # Set the width of the plot to 800 pixels
                    height=500   # Set the height of the plot to 500 pixels
                    )

  fig.show()


In [None]:
# One figure per population size, with an average/best pair of traces per list length.
for pop_size in pop_sizes:
  # Runs recorded for the current population size
  filtered_data = [d for d in hist_array if d['pop_size'] == pop_size]

  # Unique list lengths present among those runs
  list_lengths = sorted(set([d['list_length'] for d in filtered_data]))

  fig = go.Figure()
  for list_length in list_lengths:
      # Runs for the current list length
      filtered_data2 = [d for d in filtered_data if d['list_length'] == list_length]

      # Only the first matching run is plotted; other runs with the same
      # population size and list length are not shown in this figure.
      avg_values = filtered_data2[0]['avg']
      best_values = filtered_data2[0]['best']

      # Traces for the average and best fitness per generation. The legend shows
      # list_length * 2 because every individual carries two lists of that length.
      fig.add_trace(go.Scatter(x=list(range(len(avg_values))),
                                y=avg_values,
                                mode='lines',
                                name='Average values (list Length: {})'.format(list_length*2)))
      fig.add_trace(go.Scatter(x=list(range(len(best_values))),
                                y=best_values,
                                mode='lines',
                                name='Best values (list Length: {})'.format(list_length*2)))

  # The layout does not depend on the list length, so apply it once per figure.
  fig.update_layout(title_text=f"Best and Average Performance for Population Size {pop_size} Across Varying List Lengths",
                    xaxis=dict(title='Number of Generation'),
                    yaxis=dict(title='Fitness'),
                    width=800,   # Set the width of the plot to 800 pixels
                    height=500   # Set the height of the plot to 500 pixels
                    )

  fig.show()


In [None]:

# One figure per list length, with an average/best pair of traces per mutation rate.
for list_length in list_lengths:
  # Runs recorded for the current list length
  filtered_data = [d for d in hist_array if d['list_length'] == list_length]

  # Unique mutation rates present among those runs
  mutation_rates = sorted(set([d['mutation_rate'] for d in filtered_data]))

  fig = go.Figure()
  for mutation_rate in mutation_rates:
      # Runs for the current mutation rate
      filtered_data2 = [d for d in filtered_data if d['mutation_rate'] == mutation_rate]

      # Only the first matching run is plotted; other runs with the same
      # list length and mutation rate are not shown in this figure.
      avg_values = filtered_data2[0]['avg']
      best_values = filtered_data2[0]['best']

      # Traces for the average and best fitness per generation
      fig.add_trace(go.Scatter(x=list(range(len(avg_values))),
                                y=avg_values,
                                mode='lines',
                                name='Average values (mutation rate: {})'.format(mutation_rate)))
      fig.add_trace(go.Scatter(x=list(range(len(best_values))),
                                y=best_values,
                                mode='lines',
                                name='Best values (mutation rate: {})'.format(mutation_rate)))

  # The layout does not depend on the mutation rate, so apply it once per figure.
  fig.update_layout(title_text=f"Best and Average Performance for List Length {list_length*2} Across Varying Mutation Rates",
                    xaxis=dict(title='Number of Generation'),
                    yaxis=dict(title='Fitness'),
                    width=800,   # Set the width of the plot to 800 pixels
                    height=500   # Set the height of the plot to 500 pixels
                    )
  fig.show()

In [None]:
# Single GA run on the first dataset: 60 individuals, 300 words per list.
data = data_list[0]
word_pool = create_word_pool(data)
population = create_population(60, 300, word_pool)

# Score the initial population and order it worst-to-best before evolving it.
for individual in population:
  individual.fitness = fitness_function(individual, data)
population.sort(key=lambda member: member.fitness)
print(f'Best Fitness in Initialization: {population[-1].fitness}.')
best_ind, hist = genetic_algorithm(data, population, word_pool, mutation_rate = .05, num_iterations = 60)


In [None]:
# Plot the per-iteration history of the most recent GA run
# (assumes the run on data_list[0] in the cell above was executed last).

# Create the trace for the "avg" line
trace1 = go.Scatter(
    x=list(range(len(hist['avg']))),
    y=hist['avg'],
    mode='lines',
    name='avg'
)

# Create the trace for the "best" line
trace2 = go.Scatter(
    x=list(range(len(hist['best']))),
    y=hist['best'],
    mode='lines',
    name='best'
)

# Define the layout of the plot with descriptive titles instead of placeholders
layout = go.Layout(
    title='Average and Best Fitness per Iteration (data_list[0])',
    xaxis=dict(title='Number of Iterations'),
    yaxis=dict(title='Accuracy'),
    legend=dict(title='Legend'),
    width=800,   # Set the width of the plot to 800 pixels
    height=500   # Set the height of the plot to 500 pixels
)

# Create the figure object and plot the traces
fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()

# The Y-axis is the accuracy and the X-axis is the number of iterations


In [None]:
# Single GA run on the second dataset: 60 individuals, 300 words per list.
data = data_list[1]
word_pool = create_word_pool(data)
population = create_population(60, 300, word_pool)


# Score the initial population and order it worst-to-best before evolving it.
for ind in population:
  ind.fitness = fitness_function(ind, data)
population = sorted(population,key=lambda x: x.fitness)
# The message previously ran the text straight into the number
# ("initialization0.56"); it now has a separator.
print(f'Best Fitness in Initialization: {population[-1].fitness}.')
best_ind, hist = genetic_algorithm(data, population, word_pool, mutation_rate = .05, num_iterations = 60)


In [None]:
# Plot the per-iteration history of the most recent GA run
# (assumes the run on data_list[1] in the cell above was executed last).

# Create the trace for the "avg" line
trace1 = go.Scatter(
    x=list(range(len(hist['avg']))),
    y=hist['avg'],
    mode='lines',
    name='avg'
)

# Create the trace for the "best" line
trace2 = go.Scatter(
    x=list(range(len(hist['best']))),
    y=hist['best'],
    mode='lines',
    name='best'
)

# Define the layout of the plot with descriptive titles instead of placeholders
layout = go.Layout(
    title='Average and Best Fitness per Iteration (data_list[1])',
    xaxis=dict(title='Number of Iterations'),
    yaxis=dict(title='Accuracy'),
    legend=dict(title='Legend'),
    width=800,   # Set the width of the plot to 800 pixels
    height=500   # Set the height of the plot to 500 pixels
)

# Create the figure object and plot the traces
fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()

# The Y-axis is the accuracy and the X-axis is the number of iterations


In [None]:
# Single GA run on the third dataset: 60 individuals, 300 words per list.
data = data_list[2]
word_pool = create_word_pool(data)
population = create_population(60, 300, word_pool)


# Score the initial population and order it worst-to-best before evolving it.
for ind in population:
  ind.fitness = fitness_function(ind, data)
population = sorted(population,key=lambda x: x.fitness)
# The message previously ran the text straight into the number
# ("initialization0.56"); it now has a separator.
print(f'Best Fitness in Initialization: {population[-1].fitness}.')
best_ind, hist = genetic_algorithm(data, population, word_pool, mutation_rate = .05, num_iterations = 60)


best fitness in initialization0.56.
iteration 1, best fitness 0.56.
iteration 2, best fitness 0.5533333333333333.
iteration 3, best fitness 0.56.
iteration 4, best fitness 0.5733333333333334.
iteration 5, best fitness 0.5733333333333334.
iteration 6, best fitness 0.6066666666666667.
iteration 7, best fitness 0.6066666666666667.
iteration 8, best fitness 0.62.
iteration 9, best fitness 0.6266666666666667.
iteration 10, best fitness 0.6266666666666667.
iteration 11, best fitness 0.6333333333333333.
iteration 12, best fitness 0.64.
iteration 13, best fitness 0.6333333333333333.
iteration 14, best fitness 0.64.
iteration 15, best fitness 0.64.
iteration 16, best fitness 0.64.
iteration 17, best fitness 0.6466666666666666.
iteration 18, best fitness 0.6533333333333333.
iteration 19, best fitness 0.6533333333333333.
iteration 20, best fitness 0.66.
iteration 21, best fitness 0.6533333333333333.
iteration 22, best fitness 0.6666666666666666.
iteration 23, best fitness 0.6733333333333333.
iter

In [None]:
# Plot the per-iteration history of the most recent GA run
# (assumes the run on data_list[2] in the cell above was executed last).

# Create the trace for the "avg" line
trace1 = go.Scatter(
    x=list(range(len(hist['avg']))),
    y=hist['avg'],
    mode='lines',
    name='avg'
)

# Create the trace for the "best" line
trace2 = go.Scatter(
    x=list(range(len(hist['best']))),
    y=hist['best'],
    mode='lines',
    name='best'
)

# Define the layout of the plot with descriptive titles instead of placeholders
layout = go.Layout(
    title='Average and Best Fitness per Iteration (data_list[2])',
    xaxis=dict(title='Number of Iterations'),
    yaxis=dict(title='Accuracy'),
    legend=dict(title='Legend'),
    width=800,   # Set the width of the plot to 800 pixels
    height=500   # Set the height of the plot to 500 pixels
)

# Create the figure object and plot the traces
fig = go.Figure(data=[trace1, trace2], layout=layout)
fig.show()

# The Y-axis is the accuracy and the X-axis is the number of iterations
