PARTE PFAIR

In [1]:
import numba
import pandas as pd
import pandas.core.groupby
import pandas.core.series
import logging
import sys
import numpy as np
import time
from sortedcontainers import SortedDict
from sklearn.preprocessing import minmax_scale
import math

In [2]:
#Preprocessing

logging.basicConfig(format='%(asctime)s -> %(message)s', level=logging.INFO)
logging.info("Loading data...")

alg = "MF"
df_rec = pd.read_csv(alg+'.csv', usecols=['user_id', 'item_id', 'score'])
df_items = pd.read_csv('tracks.csv')

df_merge1 = pd.merge(df_rec, df_items, on='item_id', how='left')
data = df_merge1[['user_id', 'item', 'score', 'group']]
data = data.rename(index=str, columns={"item": "item_id"})

def log2(n):
    return math.log10(n) / math.log10(2)

def get_exposure(position):
    return 1 / log2(1 + position)

data['id'] = range(len(data))
data['position'] = data.groupby('user_id').cumcount() + 1
data['exposure'] = data['position'].apply(get_exposure)

final = data[['id','user_id', 'item_id', 'score', 'exposure', 'group', 'position']]

print(final)
#final.to_csv(alg+'_R.csv', index=False)

2024-02-09 13:05:57,389 -> Loading data...


                id    user_id  item_id     score  exposure          group  \
0                0      18262     1610  1.950241  1.000000         Europe   
1                1      18262     3704  1.917848  0.630930         Europe   
2                2      18262      978  1.796800  0.500000         Europe   
3                3      18262     4421  1.763362  0.430677         Europe   
4                4      18262     1404  1.736914  0.386853         Europe   
...            ...        ...      ...       ...       ...            ...   
31003995  31003995  999939894     1530  1.492879  0.100387  North America   
31003996  31003996  999939894    74164  1.492259  0.100372         Europe   
31003997  31003997  999939894    10769  1.491575  0.100358  North America   
31003998  31003998  999939894     2072  1.491556  0.100343         Europe   
31003999  31003999  999939894     8730  1.491505  0.100329  North America   

          position  
0                1  
1                2  
2           

In [3]:
#VISIBILITY

#final = pd.read_csv(alg+'_R.csv')
amount = len(final)
logging.info("{} uploaded records".format(amount))

users = final.to_numpy()
items = df_items.to_numpy()

# Normalize rating (0.0-1.0 range)
users[:,3] = minmax_scale(users[:,3])

2024-02-09 13:08:08,400 -> 31004000 uploaded records


In [4]:
def get_continent_id(continent):
    continent_ids = {"Africa": 1, "Asia": 2, "Europe": 3,"North America": 4,
                     "Oceania": 5, "South America": 6}

    if continent in continent_ids:
        return continent_ids[continent]
    else:
        # If there are multiple continents for this item, we split them into a list
        if isinstance(continent, str):
            continents = continent.split("|")
            result = ""
            # Then we calculate an ID for each continent in the list and append them to a single value
            # For example, Africa|Europe|Oceania will become '135'
            for c in continents:
                result += str(get_continent_id(c))
            return int(result)
        else:
            return 0

# Number of items per continent
continent_dict = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}

topk = 10
for i in range(len(users)):
    # We calculate numerical value for each continent ID
    item = int(users[i][2]-1)
    continent_name = items[item][-1]
    users[i][-2] = get_continent_id(continent_name)

    # Then we count the number of occurences for each item by continent
    if users[i][-1] <= topk:
        group = int(users[i][-2])
        continents = [int(d) for d in str(group)]
        for c in continents:
            if c in continent_dict:
                continent_dict[c] += 1
    i += 1

In [5]:
df_train = pd.read_csv( 'train.csv', usecols=['user', 'item', 'rating'] )

df_items.columns=['id','item', 'artist_id', 'group']

df_merge2 = pd.merge(df_train, df_items, on='item')
train = df_merge2[['user', 'id', 'rating', 'group']]
train = train.rename(index=str, columns={"id": "item"})
print(train)
#train.to_csv('train_R.csv', index=False)

              user    item  rating          group
0        864287919  332225       2  North America
1        864287919  142572       1  North America
2        183028821  142572       1  North America
3        646293561  142572       1  North America
4        488006481  142572       1  North America
...            ...     ...     ...            ...
1806688  266275331  411071       1  North America
1806689  266275331  411074       1         Europe
1806690  266275331  411075       1  North America
1806691  319628289  397679       1  North America
1806692  458765532  308827       1         Europe

[1806693 rows x 4 columns]


In [6]:
counter = {}

def count_groups(groups):
    global counter

    groups = str(groups).split("|")
    for g in groups:
        if g not in counter:
            counter[g] = 0
        counter[g] = counter[g] + 1

train["group"] = train["group"].apply(count_groups)

keys = counter.keys()
sorted_keys = sorted(keys)

sorted_counter = {}
for key in sorted_keys:
    sorted_counter[key] = counter[key]

print(sorted_counter)

total_t = sum(sorted_counter.values())

target = [g / total_t for g in sorted_counter.values()]

print (target)

{'Africa': 2928, 'Asia': 76092, 'Europe': 548450, 'North America': 1093495, 'Oceania': 41838, 'South America': 43890}
[0.0016206405847590045, 0.04211672929490511, 0.30356568603520356, 0.6052467131936637, 0.02315722704410766, 0.024293003847360897]


In [7]:
total = sum(continent_dict.values())
proportions = [c / total for c in continent_dict.values()]
initial_proportions = proportions.copy()
logging.info("Initial proportions: {}".format(initial_proportions))

target_proportions = target.copy()
logging.info("Target proportions: {}".format(target_proportions))

# The difference between target proportions and initial ones
# If the number is negative, we want to swap item out of Topk
# If the number is positive, we want to swap item into Topk
proportions_delta = np.array(target_proportions) - np.array(proportions)
logging.info("Proportions delta: {}".format(proportions_delta))

2024-02-09 13:09:38,630 -> Initial proportions: [0.0030705715391562377, 0.02636111469487808, 0.2595019997419688, 0.6898625983744033, 0.01180815378660818, 0.009395561862985421]
2024-02-09 13:09:38,634 -> Target proportions: [0.0016206405847590045, 0.04211672929490511, 0.30356568603520356, 0.6052467131936637, 0.02315722704410766, 0.024293003847360897]
2024-02-09 13:09:38,637 -> Proportions delta: [-0.00144993  0.01575561  0.04406369 -0.08461589  0.01134907  0.01489744]


In [8]:
def check_underrepresented(group):
    if group > len(proportions_delta):
        # If the item's group contains more than one continent, we need to split it
        continents = [int(d) for d in str(group)]
        sum_deltas = 0
        # To determine whether we need to swap, we calculate the sum of deltas for present continents
        # If the total is higher than 0, then the item is underrepresented and swap is beneficial
        for c in continents:
            sum_deltas += proportions_delta[c-1]
        return sum_deltas >= 0
    else:
        return proportions_delta[group-1] >= 0

topn = 1000
def precompute(numpy_data, num_users):
    global users

    for i in range(num_users):
        s1 = [] # List of candidates for swapping out
        s2 = [] # List of candidates for swapping in

        for j in range(topn):
            index = i*topn+j # Current item index
            group = int(users[index][-2]) # Current item group

            is_underrepresented = check_underrepresented(group)

            if (users[index][-1] <= topk and not is_underrepresented):
                s1.append(users[index])
            elif (users[index][-1] > topk and is_underrepresented):
                s2.append(users[index])

        k = 0

        while len(s2) >= 1:
            for n in range(len(s1)-1, -1, -1):
                item1 = s1[n]
                item2 = s2[0]

                # Table reference: id, id_user, id_item, score, exposure, group, position
                loss = item1[3] - item2[3]

                # We save information about the swap into the numpy array
                numpy_data[i*topn*topk+k][0] = i
                numpy_data[i*topn*topk+k][1] = users[index][1]
                numpy_data[i*topn*topk+k][2] = item1[0]
                numpy_data[i*topn*topk+k][3] = item2[0]
                numpy_data[i*topn*topk+k][4] = loss

                k += 1

            del s2[0]

    return numpy_data

In [9]:
logging.info("Calculating initial losses...")

start = time.time()

num_users = int(len(users) / topn)
numpy_data = precompute(np.zeros([len(users)*10, 5]), num_users)

end = time.time()
logging.info('Elapsed: {}'.format(end - start))

numpy_data = numpy_data[numpy_data[:,1] != 0] # Remove empty rows
numpy_data = numpy_data[numpy_data[:,4].argsort()] # Sort by loss

logging.info('Possible number of swaps: {}'.format(len(numpy_data)))

2024-02-09 13:09:55,914 -> Calculating initial losses...
2024-02-09 13:12:32,544 -> Elapsed: 156.62375497817993
2024-02-09 13:13:03,274 -> Possible number of swaps: 76234863


In [10]:
def update_proportions(group, is_underrepresented):
    global continent_dict

    continents = [int(d) for d in str(group)]
    value = 1 if is_underrepresented else -1
    for c in continents:
        continent_dict[c] += value

In [11]:
def rerank():
    global numpy_data, num_users, users, proportions, proportions_delta

    loss_total = 0.0
    i = 0
    completed_swaps = 0

    while(i < len(numpy_data)):
        item_1 = int(numpy_data[i][2]-1) # Item to swap out
        item_2 = int(numpy_data[i][3]-1) # Item to swap in

        group_1 = int(users[item_1][-2])
        group_2 = int(users[item_2][-2])

        if (check_underrepresented(group_1) == False and check_underrepresented(group_2) == True):
            # Preserve the original position values
            position_aux = users[item_1][-1]
            users[item_1][-1] = users[item_2][-1]
            users[item_2][-1] = position_aux

            # Preserve the original indexes
            index_aux = users[item_1][0]
            users[item_1][0] = users[item_2][0]
            users[item_2][0] = index_aux

            # Calculate loss when performing this swap
            loss = users[item_1][3] - users[item_2][3]

            # Swap the items
            users[[item_1, item_2]] = users[[item_2, item_1]]
            loss_total += loss

            # We recompute the current proportions and their delta
            update_proportions(group_1, False)
            update_proportions(group_2, True)
            total = sum(continent_dict.values())
            proportions = [c / total for c in continent_dict.values()]
            proportions_delta = np.array(target_proportions) - np.array(proportions)

            completed_swaps += 1

        i += 1

    logging.info('Completed swaps: {}'.format(completed_swaps))
    logging.info('Total iterations: {}'.format(i))
    return loss_total, users

In [13]:
start = time.time()

loss_total, result = rerank()

end = time.time()

logging.info('Elapsed: {}'.format(end - start))

continent_dict = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}

for i in range(len(result)):
    # Then we count the number of occurences for each item by continent
    if result[i][-1] <= topk:
        group = int(result[i][-2])
        continents = [int(d) for d in str(group)]
        for c in continents:
            if c in continent_dict:
                continent_dict[c] += 1
    i += 1

total = sum(continent_dict.values())
proportions = [c / total for c in continent_dict.values()]

logging.info("Initial proportions: {}".format(initial_proportions))
logging.info("Current proportions: {}".format(proportions))
logging.info("Target proportions: {}".format(target_proportions))
original_delta = np.array(target_proportions) - np.array(initial_proportions)
proportions_delta = np.array(target_proportions) - np.array(proportions)
logging.info("Original delta: {}".format(original_delta))
logging.info("Proportions delta: {}".format(proportions_delta))
logging.info("Total loss: {}".format(loss_total))

df_vis = pd.DataFrame(data=result, columns=['id', 'user_id', 'item_id', 'score', 'exposure', 'group', 'position'])
df_vis = df_vis.astype({'id': 'Int64', 'user_id': 'Int64', 'item_id': 'Int64', 'group': 'Int64', 'position': 'Int64'})
df_vis_final = df_vis.loc[(df_vis['position'] <= topk)]

df_vis_final = df_vis_final.sort_values(by=['user_id', 'position'], ascending=[True, True])
print(df_vis_final)

#df_vis_final.to_csv(alg+'_V.csv', index=False, header = True)
logging.info('Ended process.')

2024-02-09 13:20:00,004 -> Completed swaps: 9484633
2024-02-09 13:20:00,007 -> Total iterations: 76234863
2024-02-09 13:20:00,009 -> Elapsed: 380.75543689727783
2024-02-09 13:20:07,618 -> Initial proportions: [0.0030705715391562377, 0.02636111469487808, 0.2595019997419688, 0.6898625983744033, 0.01180815378660818, 0.009395561862985421]
2024-02-09 13:20:07,618 -> Current proportions: [0.0017030060637337118, 0.0416752677073926, 0.29473616307573214, 0.61461101793317, 0.02242936395303832, 0.0248451812669333]
2024-02-09 13:20:07,619 -> Target proportions: [0.0016206405847590045, 0.04211672929490511, 0.30356568603520356, 0.6052467131936637, 0.02315722704410766, 0.024293003847360897]
2024-02-09 13:20:07,621 -> Original delta: [-0.00144993  0.01575561  0.04406369 -0.08461589  0.01134907  0.01489744]
2024-02-09 13:20:07,621 -> Proportions delta: [-8.23654790e-05  4.41461588e-04  8.82952296e-03 -9.36430474e-03
  7.27863091e-04 -5.52177420e-04]
2024-02-09 13:20:07,622 -> Total loss: 17425.63415232

                id    user_id  item_id     score  exposure  group  position
0                0      18262     4660  0.112689  0.100343      3         1
1                1      18262     4207  0.113931  0.100653      4         2
2                2      18262      978  0.236304       0.5      4         3
3                3      18262     4421  0.231886  0.430677      3         4
4                4      18262     1404  0.228391  0.386853      3         5
...            ...        ...      ...       ...       ...    ...       ...
31003005  31003005  999939894     1401  0.351461  0.356207      4         6
31003006  31003006  999939894     1717  0.349625  0.333333      3         7
31003007  31003007  999939894    13639  0.196252  0.100416      3         8
31003008  31003008  999939894     1072  0.197925  0.100833      4         9
31003009  31003009  999939894     2297  0.338099  0.289065      4        10

[310040 rows x 7 columns]


2024-02-09 13:20:36,474 -> Ended process.


In [14]:
#EXPOSURE
#df_vis_final = pd.read_csv(alg+'_V.csv')

amount = len(df_vis_final)
logging.info("{} uploaded records".format(amount))

users = df_vis_final.to_numpy()
items = df_items.to_numpy()

2024-02-09 13:21:11,051 -> 310040 uploaded records


In [15]:
def get_continent_id(continent):
    continent_ids = {"Africa": 1, "Asia": 2, "Europe": 3,"North America": 4, "Oceania": 5, "South America": 6}
    
    if continent in continent_ids:
        return continent_ids[continent]
    else:
        # If there are multiple continents for this item, we split them into a list
        if isinstance(continent, str):
            continents = continent.split("|")
            result = ""
            # Then we calculate an ID for each continent in the list and append them to a single value
            # For example, Africa|Europe|Oceania will become '135'
            for c in continents:
                result += str(get_continent_id(c))
            return int(result)
        else:
            return 0

# Number of items per continent
continent_dict = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}
  
for i in range(len(users)):
    # We calculate numerical value for each continent ID
    item = int(users[i][2]-1)
    continent_name = items[item][-1]
    users[i][-2] = get_continent_id(continent_name)
    
    # Then we count the total exposure for each item by continent 
    if users[i][-1] <= topk:
        group = int(users[i][-2])
        continents = [int(d) for d in str(group)]
        for c in continents:
            if c in continent_dict:
                continent_dict[c] += users[i][4]
                
    i += 1

In [16]:
total = sum(continent_dict.values())
proportions = [c / total for c in continent_dict.values()]
initial_proportions = proportions.copy()
logging.info("Initial proportions: {}".format(initial_proportions))

target_proportions = target.copy()
logging.info("Target proportions: {}".format(target_proportions))

# The difference between target proportions and initial ones
# If the number is negative, we want to swap item out of Topk
# If the number is positive, we want to swap item into Topk
proportions_delta = np.array(target_proportions) - np.array(proportions)
logging.info("Proportions delta: {}".format(proportions_delta))

2024-02-09 13:21:39,866 -> Initial proportions: [0.0022465115592677147, 0.032424770115773986, 0.2727448292520351, 0.6567975295587672, 0.01669031142705647, 0.019096048087099544]
2024-02-09 13:21:39,869 -> Target proportions: [0.0016206405847590045, 0.04211672929490511, 0.30356568603520356, 0.6052467131936637, 0.02315722704410766, 0.024293003847360897]
2024-02-09 13:21:39,873 -> Proportions delta: [-0.00062587  0.00969196  0.03082086 -0.05155082  0.00646692  0.00519696]


In [17]:
# We determine if the current item's group is underrepresented or overrepresented
def check_underrepresented(group):
    if group > len(proportions_delta):
        # If the item's group contains more than one continent, we need to split it
        continents = [int(d) for d in str(group)]
        sum_deltas = 0
        # To determine whether we need to swap, we calculate the sum of deltas for present continents
        # If the total is higher than 0, then the item is underrepresented and swap is beneficial
        for c in continents:
            sum_deltas += proportions_delta[c-1]
        return sum_deltas >= 0
    else:
        return proportions_delta[group-1] >= 0

def precompute(numpy_data, num_users):
    global users
    
    for i in range(num_users):
        s1 = [] # List of candidates for swapping out
        s2 = [] # List of candidates for swapping in
        
        order = {} # Dictionary for saving the order of items for the current user
        
        #s1
        for j in range(topk):
            index = i*topk+j # Current item index
            item_id = int(users[index][0]) # Current item ID
            group = int(users[index][-2]) # Current item group
            
            is_underrepresented = check_underrepresented(group)
            
            if not is_underrepresented:
                s1.append(users[index])
            
            order[item_id] = group
        
        m = 0
        
        #s2
        for n in range(len(s1)-1, -1, -1):
            for j in range(topk):
                index = i*topk+j # Current item index
                item_id = int(users[index][0]) # Current item ID
                group = int(users[index][-2]) # Current item group
                item1 = s1[n]
                
                is_underrepresented = check_underrepresented(group)
                
                if (is_underrepresented and item_id > s1[n][0]):
                    s2.append([item_id, users[index][1], users[index][2], users[index][3], 
                               users[index][4], order[item_id], users[index][6]])
            
            k = 0
            
            while len(s2) >= 1:
                item1 = s1[n]
                acc_count = 0
                for item in s2:
                    item2 = item
                    #count = max(item1[3] - item2[3], 0) + acc_count
                    count = abs(item1[3] - item2[3]) + acc_count
                    numpy_data[i*1000+k+m] = [i, users[index][1], item1[0], item2[0], count+len(s1)-n]

                    order[int(item1[0])] = int(item2[-2])
                    order[int(item2[0])] = int(item1[-2])

                    item1 = np.array([item2[0], item1[1], item1[2], item1[3], item2[4], item1[5], item2[6]])

                    acc_count += count
                    k += 1
                del s2[0]
            m += k
    
    return numpy_data

In [18]:
logging.info("Calculating initial losses...")

start = time.time()

num_users = int(len(users) / topk)
numpy_data = precompute(np.zeros([len(users)*100, 5]), num_users)

end = time.time()
logging.info('Elapsed: {}'.format(end - start))

numpy_data = numpy_data[numpy_data[:,1] != 0] # Remove empty rows
numpy_data = numpy_data[numpy_data[:,4].argsort()] # Sort by loss

logging.info('Possible number of swaps: {}'.format(len(numpy_data)))

2024-02-09 13:21:53,424 -> Calculating initial losses...
2024-02-09 13:21:58,123 -> Elapsed: 4.694338083267212
2024-02-09 13:21:58,690 -> Possible number of swaps: 625114


In [19]:
def update_proportions(group, exp, is_underrepresented):
    global continent_dict
    
    continents = [int(d) for d in str(group)]
    value = exp if is_underrepresented else -exp
    for c in continents:
        continent_dict[c] += value

In [20]:
def rerank():
    global numpy_data, num_users, users, proportions, proportions_delta
    
    loss_total = 0.0
    i = 0
    completed_swaps = 0

    while(i < len(numpy_data)):       
        item_1 = int(numpy_data[i][0] * 10  + (numpy_data[i][2]-1) % 50) # Item to swap out
        item_2 = int(numpy_data[i][0] * 10  + (numpy_data[i][3]-1) % 50) # Item to swap in

        group_1 = int(users[item_1][-2])
        group_2 = int(users[item_2][-2])
        
        exp = users[item_1][4] - users[item_2][4]
        
        if (check_underrepresented(group_1) == False and check_underrepresented(group_2) == True):
            # Preserve the original position values
            position_aux = users[item_1][-1]
            users[item_1][-1] = users[item_2][-1]
            users[item_2][-1] = position_aux
            
            # Preserve the original indexes
            index_aux = users[item_1][0]
            users[item_1][0] = users[item_2][0]
            users[item_2][0] = index_aux
            
            # Preserve the original exposure values
            exp_aux = users[item_1][4]
            users[item_1][4] = users[item_2][4]
            users[item_2][4] = exp_aux

            # Calculate loss when performing this swap
            loss = users[item_1][3] - users[item_2][3]

            # Swap the items
            users[[item_1, item_2]] = users[[item_2, item_1]]
            loss_total += loss

            # We recompute the current proportions and their delta
            update_proportions(group_1, exp, False)
            update_proportions(group_2, exp, True)
            total = sum(continent_dict.values())
            proportions = [c / total for c in continent_dict.values()]
            proportions_delta = np.array(target_proportions) - np.array(proportions)

            completed_swaps += 1
            
        i += 1
        
    logging.info('Completed swaps: {}'.format(completed_swaps))
    logging.info('Total iterations: {}'.format(i))
    return loss_total, users

In [23]:
start = time.time()

loss_total, result = rerank()

end = time.time()

logging.info('Elapsed: {}'.format(end - start))

continent_dict = {1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0}

for i in range(len(result)):
    if result[i][-1] <= topk:
        group = int(result[i][-2])
        continents = [int(d) for d in str(group)]
        for c in continents:
            if c in continent_dict:
                continent_dict[c] += result[i][4]
                
    i += 1
    
total = sum(continent_dict.values())
proportions = [c / total for c in continent_dict.values()]

logging.info("Initial proportions: {}".format(initial_proportions))
logging.info("Current proportions: {}".format(proportions))
logging.info("Target proportions: {}".format(target_proportions))
original_delta = np.array(target_proportions) - np.array(initial_proportions)
proportions_delta = np.array(target_proportions) - np.array(proportions)
logging.info("Original delta: {}".format(original_delta))
logging.info("Proportions delta: {}".format(proportions_delta))
logging.info("Total loss: {}".format(loss_total))

df_exp = pd.DataFrame(data=result, columns=['id', 'user_id', 'item_id', 'score', 'exposure', 'group', 'position'])
df_exp = df_exp.astype({'id': 'Int64', 'user_id': 'Int64', 'item_id': 'Int64', 'group': 'Int64', 'position': 'Int64'})
df_exp_final = df_exp.loc[(df_exp['position'] <= topk)]

df_exp_final = df_exp_final.sort_values(by=['user_id', 'position'], ascending=[True, True])
print(df_exp_final)

df_exp_final.to_csv(alg+'_VE.csv', index=False, header = True)

logging.info('Ended process.')

2024-02-09 13:25:31,695 -> Completed swaps: 1571
2024-02-09 13:25:31,695 -> Total iterations: 625114
2024-02-09 13:25:31,696 -> Elapsed: 3.0819449424743652
2024-02-09 13:25:32,079 -> Initial proportions: [0.0022465115592677147, 0.032424770115773986, 0.2727448292520351, 0.6567975295587672, 0.01669031142705647, 0.019096048087099544]
2024-02-09 13:25:32,080 -> Current proportions: [0.0022093831015934417, 0.03696233129422562, 0.28837309325541327, 0.6299026609278, 0.01973339766334067, 0.022819133757627237]
2024-02-09 13:25:32,080 -> Target proportions: [0.0016206405847590045, 0.04211672929490511, 0.30356568603520356, 0.6052467131936637, 0.02315722704410766, 0.024293003847360897]
2024-02-09 13:25:32,081 -> Original delta: [-0.00062587  0.00969196  0.03082086 -0.05155082  0.00646692  0.00519696]
2024-02-09 13:25:32,081 -> Proportions delta: [-0.00058874  0.0051544   0.01519259 -0.02465595  0.00342383  0.00147387]
2024-02-09 13:25:32,081 -> Total loss: 37.037692623185826


              id    user_id  item_id     score  exposure  group  position
0              0      18262     4660  0.112689  0.100343      3         1
1              1      18262     4421  0.231886  0.100653      3         2
2              2      18262     1404  0.228391       0.5      3         3
3              3      18262     3706  0.227977  0.430677      3         4
4              4      18262      978  0.236304  0.386853      4         5
...          ...        ...      ...       ...       ...    ...       ...
310035  31003005  999939894     1401  0.351461  0.356207      4         6
310036  31003006  999939894    15425  0.196708  0.333333      4         7
310037  31003007  999939894    13639  0.196252  0.100416      3         8
310038  31003008  999939894     1072  0.197925  0.100833      4         9
310039  31003009  999939894     2297  0.338099  0.289065      4        10

[310040 rows x 7 columns]


2024-02-09 13:25:33,355 -> Ended process.
