In [2]:
import pandas as pd

# Load the combined dataset, keyed by video id, and separate the regression
# target column from the predictor columns.
combined_data = (
    pd.read_csv('combined_data.csv')
    .set_index('video_id')
)
features = combined_data.drop(columns=['label'])
label = combined_data['label']

In [25]:
import random
import numpy as np
import pandas as pd
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Shared estimator used by fitness_score to evaluate every candidate feature subset.
# The fixed random_state keeps the score of a given subset repeatable.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
)

# 1. Initial Population
def initialise_population(size, n_feat):
    """Create the starting population of boolean feature masks.

    Each chromosome has ``n_feat`` genes; ``int(0.3 * n_feat)`` of them are
    False (feature dropped) and the rest True (feature kept), placed at
    random positions via ``np.random.shuffle``.

    Parameters
    ----------
    size : int
        Number of chromosomes to create.
    n_feat : int
        Number of genes (candidate features) per chromosome.

    Returns
    -------
    list of numpy.ndarray
        ``size`` boolean arrays of length ``n_feat``.
    """
    n_dropped = int(0.3 * n_feat)
    masks = []
    for _ in range(size):
        mask = np.zeros(n_feat, dtype=bool)
        mask[n_dropped:] = True
        # In-place shuffle scatters the dropped positions across the mask.
        np.random.shuffle(mask)
        masks.append(mask)
    return masks

# 2. Fitness Function
def fitness_score(population, X_train, X_test, y_train, y_test):
    """Score every chromosome and return the population ranked best-first.

    For each chromosome the module-level ``model`` is refitted on the selected
    columns of ``X_train`` and evaluated on the same columns of ``X_test``.
    Fitness is the negative mean squared error, so a higher score is better.

    A chromosome that selects no feature at all cannot be fitted; it receives
    a fitness of ``-inf`` (ranked last) instead of aborting the whole search.

    NOTE(review): fitness is measured on the ``X_test``/``y_test`` passed in.
    If the caller later reports final metrics on that same split, those
    metrics are optimistic; a separate validation split would avoid this.

    Parameters
    ----------
    population : list of numpy.ndarray
        Boolean feature masks, one per individual.
    X_train, X_test : numpy.ndarray
        2-D feature matrices indexed as ``X[:, chromosome]``.
    y_train, y_test : array-like
        Targets matching the rows of ``X_train`` / ``X_test``.

    Returns
    -------
    (list, list)
        Scores in descending order and the chromosomes in the same order.
    """
    scores = []
    for chromosome in population:
        if not np.any(chromosome):
            # Zero selected columns: the estimator would raise on fit, so
            # mark this individual as the worst possible instead.
            scores.append(-np.inf)
            continue
        X_train_filtered = X_train[:, chromosome]
        X_test_filtered = X_test[:, chromosome]
        model.fit(X_train_filtered, y_train)
        predictions = model.predict(X_test_filtered)
        mse = mean_squared_error(y_test, predictions)
        # Use negative MSE because we want to maximize fitness (minimize MSE)
        scores.append(-mse)
    scores, population = np.array(scores), np.array(population)
    inds = np.argsort(scores)[::-1]  # Higher score (lower MSE) comes first
    return list(scores[inds]), list(population[inds])

# 3. Selection
def selection(pop_after_fit, n_parents):
    """Truncation selection: keep the leading ``n_parents`` individuals.

    ``pop_after_fit`` is expected to be ordered best-first already, so the
    survivors are simply the head of the sequence.
    """
    survivors = pop_after_fit[0:n_parents]
    return survivors

# 4. Crossover
def crossover(pop_after_sel):
    """Single-point crossover over consecutive pairs of parents.

    Parents are paired as (0, 1), (2, 3), ...; a trailing unpaired parent is
    ignored. Each pair produces two children that exchange their tails after
    a random cut point. The parents themselves are never modified.

    Parameters
    ----------
    pop_after_sel : list of numpy.ndarray
        Selected parent chromosomes.

    Returns
    -------
    list of numpy.ndarray
        Two children per complete parent pair.
    """
    population_nextgen = []
    for i in range(0, len(pop_after_sel) - 1, 2):
        child1, child2 = pop_after_sel[i].copy(), pop_after_sel[i + 1].copy()
        n_genes = len(child1)
        if n_genes >= 2:
            # Same cut range as before for >= 3 genes; a 2-gene chromosome has
            # exactly one valid cut (1). Shorter chromosomes cannot be cut.
            crossover_point = np.random.randint(1, max(n_genes - 1, 2))
            # NumPy slices are views, so a tuple swap would overwrite child1's
            # tail before it is copied into child2. Save it explicitly first.
            tail1 = child1[crossover_point:].copy()
            child1[crossover_point:] = child2[crossover_point:]
            child2[crossover_point:] = tail1
        population_nextgen.extend([child1, child2])
    return population_nextgen

# 5. Mutation
def mutation(pop_after_cross, mutation_rate):
    """Flip each gene independently with probability ``mutation_rate``.

    One ``random.random()`` draw is made per gene, in order; a draw below
    ``mutation_rate`` inverts that gene. Input chromosomes are not modified;
    a new array is built for every individual.

    Parameters
    ----------
    pop_after_cross : list of numpy.ndarray
        Chromosomes produced by crossover.
    mutation_rate : float
        Per-gene flip probability.

    Returns
    -------
    list of numpy.ndarray
        Mutated chromosomes, in the same order as the input.
    """
    mutated = []
    for parent in pop_after_cross:
        genes = []
        for gene in parent:
            flip = random.random() < mutation_rate
            genes.append((not gene) if flip else gene)
        mutated.append(np.array(genes))
    return mutated

# Generations
def generations(size, n_feat, n_parents, mutation_rate, n_gen, X_train, X_test, y_train, y_test):
    """Run the genetic feature-selection loop for ``n_gen`` generations.

    Each generation is scored, its best individual is recorded, the top
    ``n_parents`` are bred by crossover, and the offspring are mutated to
    form the next generation.

    Crossover yields only one child per parent, so without a refill the
    population would drop from ``size`` to at most ``n_parents`` after the
    first generation. Extra children are therefore bred from randomly
    re-paired parents until the pool holds ``size`` individuals again.
    This means every generation evaluates ``size`` chromosomes.

    Parameters
    ----------
    size : int
        Population size maintained across generations.
    n_feat : int
        Number of genes (candidate features) per chromosome.
    n_parents : int
        Number of top-ranked individuals kept for breeding.
    mutation_rate : float
        Per-gene flip probability passed to ``mutation``.
    n_gen : int
        Number of generations to run.
    X_train, X_test, y_train, y_test
        Data forwarded unchanged to ``fitness_score``.

    Returns
    -------
    (list, list)
        Per-generation best chromosome and its score. There is no elitism,
        so the last entry is not guaranteed to be the overall best.
    """
    best_chromo = []
    best_score = []
    population_nextgen = initialise_population(size, n_feat)
    for _ in range(n_gen):
        scores, pop_after_fit = fitness_score(population_nextgen, X_train, X_test, y_train, y_test)
        best_chromo.append(pop_after_fit[0])
        best_score.append(scores[0])

        pop_after_sel = selection(pop_after_fit, n_parents)
        pop_after_cross = crossover(pop_after_sel)
        # Top up to the requested population size. The loop is skipped when
        # no child could be produced at all (fewer than two parents).
        while pop_after_cross and len(pop_after_cross) < size:
            order = np.random.permutation(len(pop_after_sel))
            pop_after_cross.extend(crossover([pop_after_sel[i] for i in order]))
        population_nextgen = mutation(pop_after_cross[:size], mutation_rate)
    return best_chromo, best_score


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features.values,
    label.values,
    test_size=0.2,
    random_state=0,
)

# Standardise the features using statistics learned from the training rows only.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)
print('check_data_split', [part.shape for part in (x_train, x_test, y_train, y_test)])

check_data_split [(123653, 38), (30914, 38), (123653,), (30914,)]


In [None]:
# Run the genetic feature search on the scaled split (x_train, x_test, y_train, y_test).
# The search draws from both `random` (mutation) and `np.random` (initial
# population, crossover), so seed both to make the run reproducible.
random.seed(0)
np.random.seed(0)

size = 50  # Population size
n_feat = x_train.shape[1]  # Number of features
n_parents = 25  # Number of parents for crossover
mutation_rate = 0.01  # Mutation rate
n_gen = 100  # Number of generations
best_chromo, best_score = generations(size, n_feat, n_parents, mutation_rate, n_gen, x_train, x_test, y_train, y_test)
print("Best Score:", best_score[-1])
# The last generation is not necessarily the best one, so report the overall best too.
print("Best Score (all generations):", max(best_score))

In [ ]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor

# best_chromo holds one winner per generation. The search has no elitism, so
# the final generation's winner can score worse than an earlier one; use the
# mask with the highest recorded fitness instead of the last entry.
best_generation = int(np.argmax(best_score))
best_mask = best_chromo[best_generation]
x_train_genetic = x_train[:, best_mask]
x_test_genetic = x_test[:, best_mask]
n_estimators = 140
# Define the regressors to compare, plus a voting ensemble over all of them
grad_boosting_regressor = GradientBoostingRegressor(random_state=1)
ada_boosting_regressor = AdaBoostRegressor(random_state=1, n_estimators=n_estimators)
random_forest_regressor = RandomForestRegressor(random_state=1)
bagging_regressor = BaggingRegressor(random_state=1, n_estimators=n_estimators)
linear_regr = LinearRegression()
voting_reg = VotingRegressor(estimators=[('gb', grad_boosting_regressor), ('rf', random_forest_regressor), ('lr', linear_regr), ('ada_b', ada_boosting_regressor), ('bagging_r', bagging_regressor)])

In [ ]:
# Train every model on the GA-selected feature columns:
# GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor,
# RandomForestRegressor, LinearRegression, then the VotingRegressor.
for regressor in (
    grad_boosting_regressor,
    ada_boosting_regressor,
    bagging_regressor,
    random_forest_regressor,
    linear_regr,
):
    regressor.fit(x_train_genetic, y_train)
voting_reg.fit(x_train_genetic, y_train)

In [ ]:
# Predict on the held-out rows using the same GA-selected columns the models
# were fitted on (x_train_genetic / x_test_genetic share one feature mask).
gdb_prediction = grad_boosting_regressor.predict(x_test_genetic)
rf_prediction = random_forest_regressor.predict(x_test_genetic)
lg_prediction = linear_regr.predict(x_test_genetic)
ada_b_prediction = ada_boosting_regressor.predict(x_test_genetic)
bagging_r_prediction = bagging_regressor.predict(x_test_genetic)
voting_reg_prediction = voting_reg.predict(x_test_genetic)

In [ ]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score

# One row per model: name, MSE, R^2 and explained variance on the test split.
print('mean squared error', 'r2 score', 'explained variance score')
for model_name, predicted in (
    ('GradientBoostingRegressor', gdb_prediction),
    ('RandomForestRegressor', rf_prediction),
    ('LinearRegression', lg_prediction),
    ('AdaBoostRegressor', ada_b_prediction),
    ('BaggingRegressor', bagging_r_prediction),
    ('VotingRegressor', voting_reg_prediction),
):
    print(
        model_name,
        mean_squared_error(y_test, predicted),
        r2_score(y_test, predicted),
        explained_variance_score(y_test, predicted),
    )