In [1]:
import random
import cudf as pd
import cupy as np

df10_standardized = pd.read_csv("cudf_tru20_al.csv", sep='\t')

def calculate_factor_values(df, factor_expression):
    return df.eval(factor_expression)

def fget_average_pAT10(df, factor_expression):
    df['factor_value'] = calculate_factor_values(df, factor_expression)
    top10_per_user = df.sort_values(['userID', 'factor_value'], ascending=[True, False]).groupby('userID').head(10)
    relevant_count = top10_per_user.groupby('userID')['isFriend'].sum()
    pAT10 = relevant_count / 10.0
    avg_pAT10 = pAT10.mean()
    del df['factor_value']
    del top10_per_user
    return avg_pAT10

def nnget_average_pAT10(df, factor_expression):
    df['factor_value'] = df.eval(factor_expression)
    isFriend_zero_df = df[df['isFriend'] == 0]
    top10_per_user = isFriend_zero_df.sort_values(['userID', 'factor_value'], ascending=[True, False]).groupby('userID').head(10)
    # 计算pAT5
    pAT5 = (top10_per_user.groupby('userID').head(5)['change'] == 1).groupby(top10_per_user['userID']).sum() / 5
    all_users_pAT5 = pAT5.reindex(df['userID'].unique(), fill_value=0)
    avg_pAT5 = all_users_pAT5.mean()
    # 计算pAT10
    pAT10 = (top10_per_user['change'] == 1).groupby(top10_per_user['userID']).sum() / 10
    all_users_pAT10 = pAT10.reindex(df['userID'].unique(), fill_value=0)
    avg_pAT10 = all_users_pAT10.mean()

    avg_pAT = (0.7 * avg_pAT5 + 0.3 * avg_pAT10) * 0.95
    del df['factor_value']
    return avg_pAT

def generate_initial_population(size, columns):
    population = []
    operators = ['+', '-', '*', '/','%','//','**']
    for _ in range(size):
        const_factor = round(random.uniform(0, 1), 6)
        expr = f"{const_factor} * {random.choice(columns)}"
        for _ in range(random.randint(1, 7)):  # Generate expressions with depth between 1 and 5
            const_factor = round(random.uniform(0, 1), 6)
            expr += f" {random.choice(operators)} {const_factor} * {random.choice(columns)}"
        population.append(expr)
    return population


def select(population, fitnesses, num_parents):
    parents = sorted(zip(population, fitnesses), key=lambda x: x[1], reverse=True)[:num_parents]
    return [expr for expr, _ in parents]

def crossover(parent1, parent2):
    parts1 = parent1.split(' ')
    parts2 = parent2.split(' ')
    if len(parts1) > 1 and len(parts2) > 1:
        crossover_point = random.randint(1, min(len(parts1), len(parts2)) - 1)
        child = ' '.join(parts1[:crossover_point] + parts2[crossover_point:])
        return child
    return parent1

def mutate(expression, columns, mutation_rate=0.1):
    parts = expression.split(' ')
    if random.random() < mutation_rate:
        index = random.randint(0, len(parts) - 1)
        if index % 2 == 0:
            const_factor = round(random.uniform(0, 1), 4)
            parts[index] = f"{const_factor} * {random.choice(columns)}"
        else:
            parts[index] = random.choice(['+', '-', '*', '/','%','//','**'])
    return ' '.join(parts)


def genetic_algorithm(df, columns, population_size, num_generations, num_parents, mutation_rate):
    population = generate_initial_population(population_size, columns)

    for generation in range(num_generations):
        #fitnesses = [nget_average_pAT10(df, expr) for expr in population]
        #fitnesses = [fget_average_pAT10(df, expr) for expr in population]
        fitnesses = [nnget_average_pAT10(df, expr) for expr in population]
        max_index = fitnesses.index(max(fitnesses))
        best_expr = population[max_index]
        best_fitness = fitnesses[max_index]
        
        print(f'Generation {generation + 1}: Best fitness {best_fitness}, Best expression {best_expr}')

        parents = select(population, fitnesses, num_parents)
        next_population = parents[:2]  # Keep the best individuals

        while len(next_population) < population_size:
            parent1, parent2 = random.sample(parents, 2)
            child = crossover(parent1, parent2)
            child = mutate(child, columns, mutation_rate)
            next_population.append(child)

        population = next_population

    best_expr = select(population, [nnget_average_pAT10(df, expr) for expr in population], 1)[0]
    return best_expr

columns = ['nFriends_x', 'nFriends_y', 'cn', 'per_fName_y', 'per_gender_y', 'per_homeCity_y', 'per_location_y', 'per_privacy_y', 'per_homeProvince_y']
best_expression = genetic_algorithm(df10_standardized, columns, population_size=15000, num_generations=400, num_parents=1500, mutation_rate=0.65)
print('Best Expression:', best_expression)

Generation 1: Best fitness 0.7879999999999999, Best expression 0.498113 * per_privacy_y + 0.598688 * per_gender_y % 0.426673 * nFriends_y % 0.548239 * cn + 0.245971 * per_homeCity_y % 0.571953 * nFriends_x
Generation 2: Best fitness 0.7879999999999999, Best expression 0.498113 * per_privacy_y + 0.598688 * per_gender_y % 0.426673 * nFriends_y % 0.548239 * cn + 0.245971 * per_homeCity_y % 0.571953 * nFriends_x
Generation 3: Best fitness 0.7909999999999999, Best expression 0.179188 * cn + 0.2536 * cn % 0.015116 * per_gender_y * 0.441305 * per_homeProvince_y // 0.442413 * per_location_y % 0.084673 * cn
Generation 4: Best fitness 0.792, Best expression 0.681094 * cn * 0.758735 * per_homeProvince_y / 0.969575 * per_homeProvince_y % 0.333063 * 0.810825 * nFriends_y % 0.227484 * cn
Generation 5: Best fitness 0.792, Best expression 0.681094 * cn * 0.758735 * per_homeProvince_y / 0.969575 * per_homeProvince_y % 0.333063 * 0.810825 * nFriends_y % 0.227484 * cn
Generation 6: Best fitness 0.797, Be

KeyboardInterrupt: 

In [1]:
import random
import cudf as pd
import cupy as np
df10_standardized = pd.read_csv("cudf.csv", sep='\t')
df10_truth= pd.read_csv("cudf_tru20_al.csv", sep='\t')

def get_pAT(df, factor_expression, weight_pAT5=0.7, weight_pAT10=0.3):
    df['factor_value'] = df.eval(factor_expression)
    sorted_df = df.sort_values(['userID', 'factor_value'], ascending=[True, False])
    top10_per_user = sorted_df.groupby('userID').head(10)
    top10_per_user['relevant'] = top10_per_user['isFriend']
    user_relevant_counts = top10_per_user.groupby('userID')['relevant'].sum().reset_index(name='relevant_count')
    user_relevant_counts['pAT5'] = user_relevant_counts['relevant_count'].clip(upper=5) / 5.0
    user_relevant_counts['pAT10'] = user_relevant_counts['relevant_count'] / 10.0
    average_pAT = (user_relevant_counts['pAT5'] * weight_pAT5 + user_relevant_counts['pAT10'] * weight_pAT10).mean()
    return average_pAT

def nnget_average_pAT10(df, factor_expression):
    df['factor_value'] = df.eval(factor_expression)
    isFriend_zero_df = df[df['isFriend'] == 0]
    top10_per_user = isFriend_zero_df.sort_values(['userID', 'factor_value'], ascending=[True, False]).groupby('userID').head(10)
    pAT5 = (top10_per_user.groupby('userID').head(5)['change'] == 1).groupby(top10_per_user['userID']).sum() / 5
    all_users_pAT5 = pAT5.reindex(df['userID'].unique(), fill_value=0)
    avg_pAT5 = all_users_pAT5.mean()
    pAT10 = (top10_per_user['change'] == 1).groupby(top10_per_user['userID']).sum() / 10
    all_users_pAT10 = pAT10.reindex(df['userID'].unique(), fill_value=0)
    avg_pAT10 = all_users_pAT10.mean()
    return avg_pAT5, avg_pAT10

def fitness_function(df_standardized, df_truth, factor_expression):
    avg_pAT1 = get_pAT(df_standardized, factor_expression)
    avg_pAT5, avg_pAT10 = nnget_average_pAT10(df_truth, factor_expression)
    avg_pAT2 = ((0.7 * avg_pAT5) + (0.3 * avg_pAT10))*0.95
    return (0.7 * avg_pAT1) + (0.3 * avg_pAT2)

def generate_initial_population(size, columns):
    population = []
    operators = ['+', '-', '*', '/','%','//','**']
    for _ in range(size):
        const_factor = round(random.uniform(0, 1), 6)
        expr = f"{const_factor} * {random.choice(columns)}"
        for _ in range(random.randint(1, 7)):  # Generate expressions with depth between 1 and 5
            const_factor = round(random.uniform(0, 1), 6)
            expr += f" {random.choice(operators)} {const_factor} * {random.choice(columns)}"
        population.append(expr)
    return population


def select(population, fitnesses, num_parents):
    parents = sorted(zip(population, fitnesses), key=lambda x: x[1], reverse=True)[:num_parents]
    return [expr for expr, _ in parents]

def crossover(parent1, parent2):
    parts1 = parent1.split(' ')
    parts2 = parent2.split(' ')
    if len(parts1) > 1 and len(parts2) > 1:
        crossover_point = random.randint(1, min(len(parts1), len(parts2)) - 1)
        child = ' '.join(parts1[:crossover_point] + parts2[crossover_point:])
        return child
    return parent1

def mutate(expression, columns, mutation_rate=0.1):
    parts = expression.split(' ')
    if random.random() < mutation_rate:
        index = random.randint(0, len(parts) - 1)
        if index % 2 == 0:
            const_factor = round(random.uniform(0, 1), 4)
            parts[index] = f"{const_factor} * {random.choice(columns)}"
        else:
            parts[index] = random.choice(['+', '-', '*', '/','%','//','**'])
    return ' '.join(parts)


def genetic_algorithm(df_standardized, df_truth, columns, population_size, num_generations, num_parents, mutation_rate):
    population = generate_initial_population(population_size, columns)
    for generation in range(num_generations):
        fitnesses = [fitness_function(df_standardized, df_truth, expr) for expr in population]
        max_index = fitnesses.index(max(fitnesses))
        best_expr = population[max_index]
        best_fitness = fitnesses[max_index]
        print(f'Generation {generation + 1}: Best fitness {best_fitness}, Best expression {best_expr}')

        parents = select(population, fitnesses, num_parents)
        next_population = parents[:2]  # Keep the best individuals

        while len(next_population) < population_size:
            parent1, parent2 = random.sample(parents, 2)
            child = crossover(parent1, parent2)
            child = mutate(child, columns, mutation_rate)
            next_population.append(child)
        population = next_population
    best_expr = select(population, [fitness_function(df_standardized, df_truth, expr) for expr in population], 1)[0]
    return best_expr

columns = ['nFriends_x', 'nFriends_y', 'cn', 'per_fName_y', 'per_gender_y', 'per_homeCity_y', 'per_location_y', 'per_privacy_y', 'per_homeProvince_y']
best_expression = genetic_algorithm(df10_standardized, df10_truth, columns, population_size=520, num_generations=1000, num_parents=52, mutation_rate=0.4)
print('Best Expression:', best_expression)

Generation 1: Best fitness 0.7938268595041322, Best expression 0.221718 * cn + 0.543768 * per_privacy_y
Generation 2: Best fitness 0.7975004132231405, Best expression 0.49676 * per_location_y % 0.430454 * cn + 0.7661 + cn
Generation 3: Best fitness 0.801490909090909, Best expression 0.17463 * 0.662 * per_fName_y + 0.952888 * cn
Generation 4: Best fitness 0.8038987603305785, Best expression 0.221718 * cn + 0.898039 * cn + 0.0575 * cn * per_homeCity_y + 0.636472 * per_privacy_y
Generation 5: Best fitness 0.8062681818181817, Best expression 0.396713 * cn + 0.537553 * per_fName_y * 0.082127 * cn
Generation 6: Best fitness 0.8128111570247931, Best expression 0.627792 * cn + 0.278125 * cn + 0.119332 * per_homeCity_y + 0.7892 * per_location_y * 0.0054 + per_privacy_y + 0.087997 * 0.6942 * per_fName_y
Generation 7: Best fitness 0.8133607438016527, Best expression 0.627792 * cn + 0.278125 % cn + 0.119332 * per_homeCity_y + 0.636472 * 0.0054 + per_privacy_y + 0.087997 * 0.6942 * per_fName_y
Gene

724