### Data Cleaning

In [1]:
import pandas as pd

df_task = pd.read_csv("TaskData.csv")

df_worker = pd.read_csv("MemberData.csv")
df_worker["Member gps latitude"] = df_worker["Member location (GPS)"]
df_worker["Member gps longitude"] = df_worker["Member location (GPS)"]

#new dataframe to split column value
new = df_worker["Member location (GPS)"].str.split(" ", n = 1, expand = True)

#seperate worker latitude column using new dataframe
df_worker["Member gps latitude"] = new[0]

#separate worker longitude column using new dataframe
df_worker["Member gps longitude"] = new[1]

#drop old Member location (GPS) column
df_worker.drop(columns =["Member location (GPS)"], inplace = True) 

#print first 5 entries of worker
print(df_worker.head())


  Member number  Booking task limit Booking task start time  Reputation value  \
0         B0001                 114                06:30:00        67997.3868   
1         B0002                 163                06:30:00        37926.5416   
2         B0003                 139                06:30:00        27953.0363   
3         B0004                  98                06:30:00        25085.6986   
4         B0005                  66                06:30:00        20919.0667   

  Member gps latitude Member gps longitude  
0           22.947097           113.679983  
1           22.577792           113.966524  
2           23.192458           113.347272  
3           23.255965            113.31875  
4            33.65205            116.97047  


### Create matrix to store distance values

In [2]:
import numpy

worker_count = df_worker.shape[0]
task_count = df_task.shape[0]

distance_matrix = numpy.empty((worker_count, task_count))                # worker count to task count matrix
distance_matrix.fill(0)
distance_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Set Threshold and Calculate Haversine distance

In [3]:

threshold = 75.0

from haversine import haversine


for worker_index, worker_row in df_worker.iterrows():
    worker_location = (float(worker_row["Member gps latitude"]), float(worker_row["Member gps longitude"]))
    
    for task_index, task_row in df_task.iterrows():
        task_location = (float(task_row["Task gps latitude"]), float(task_row["Task gps longitude"]))
        
        H_distance = haversine(worker_location, task_location)
        if H_distance <= threshold:
            distance_matrix[worker_index][task_index] = H_distance
        else:
            distance_matrix[worker_index][task_index] = float("inf")
        
        

### Convert matrix to dataframe and clean up data

In [4]:
df_distance = pd.DataFrame(distance_matrix)
df_distance = df_distance.dropna()
print(df_distance)

            0          1          2          3          4          5    \
0     52.402952  39.429391  50.059721  71.820740  51.323682  71.940835   
1      1.959023  12.346800   0.968056  28.585563   2.655389  28.293055   
2           inf        inf        inf        inf        inf        inf   
3           inf        inf        inf        inf        inf        inf   
4           inf        inf        inf        inf        inf        inf   
5           inf        inf        inf        inf        inf        inf   
6           inf        inf        inf        inf        inf        inf   
7           inf        inf        inf        inf        inf        inf   
8           inf  72.874710        inf        inf        inf        inf   
9           inf  65.189643        inf        inf        inf        inf   
10          inf        inf        inf        inf        inf        inf   
11     3.243543  15.344712   3.077266  29.750773   1.209828  29.385865   
12          inf        inf        inf 

### Worker Reputation (W_rep)

In [5]:
from sklearn import preprocessing

reputation = df_worker["Reputation value"]
print("Worker reputation value before normalization:")
print(reputation.head())


Worker reputation value before normalization:
0    67997.3868
1    37926.5416
2    27953.0363
3    25085.6986
4    20919.0667
Name: Reputation value, dtype: float64


In [6]:
df_worker["Reputation value"] = df_worker["Reputation value"].apply(lambda x: (x - min(df_worker["Reputation value"])) * 100/ (max(df_worker["Reputation value"]) - min(df_worker["Reputation value"])))
print("Worker reputation value: after normalization")
print(df_worker["Reputation value"].head())


Worker reputation value: after normalization
0    100.000000
1     55.776469
2     41.108986
3     36.892151
4     30.764516
Name: Reputation value, dtype: float64


### Add Reputation and Preference data to Haversine df_distance matrix

In [7]:
final_matrix = df_distance
final_matrix['Reputation'] = df_worker["Reputation value"]                           # Added reputation column
final_matrix['Preference'] = numpy.random.randint(0, 6, final_matrix.shape[0])          # Willingness to do work

print(final_matrix)



              0          1          2          3          4          5  \
0     52.402952  39.429391  50.059721  71.820740  51.323682  71.940835   
1      1.959023  12.346800   0.968056  28.585563   2.655389  28.293055   
2           inf        inf        inf        inf        inf        inf   
3           inf        inf        inf        inf        inf        inf   
4           inf        inf        inf        inf        inf        inf   
5           inf        inf        inf        inf        inf        inf   
6           inf        inf        inf        inf        inf        inf   
7           inf        inf        inf        inf        inf        inf   
8           inf  72.874710        inf        inf        inf        inf   
9           inf  65.189643        inf        inf        inf        inf   
10          inf        inf        inf        inf        inf        inf   
11     3.243543  15.344712   3.077266  29.750773   1.209828  29.385865   
12          inf        inf        inf 

### Calcuate Score factor

In [9]:
score_matrix = numpy.empty((worker_count, task_count))
#print(final_matrix[0][1876])

for i in range(worker_count):
    for j in range(task_count):
        if final_matrix['Preference'][i]!=0 and final_matrix['Reputation'][i]!=0:
            score =  final_matrix[j][i] /(final_matrix['Preference'][i] * final_matrix['Reputation'][i])
            score_matrix[i][j] = score
        else:
            score_matrix[i][j] = 999.0
        

In [12]:
print(pd.DataFrame(score_matrix).head())


          0           1           2           3           4           5    \
0  999.000000  999.000000  999.000000  999.000000  999.000000  999.000000   
1    0.007025    0.044272    0.003471    0.102500    0.009522    0.101452   
2         inf         inf         inf         inf         inf         inf   
3         inf         inf         inf         inf         inf         inf   
4         inf         inf         inf         inf         inf         inf   

          6           7           8           9       ...             825  \
0  999.000000  999.000000  999.000000  999.000000     ...      999.000000   
1    0.011671    0.007020    0.040531    0.020718     ...             inf   
2         inf         inf         inf         inf     ...        0.308309   
3         inf         inf         inf         inf     ...        0.376667   
4         inf         inf         inf         inf     ...             inf   

          826         827         828         829         830         831 

### Substitute inf with a high number (999)

In [13]:
score_matrix[score_matrix == float("inf")] = 999.0
print(pd.DataFrame(score_matrix).head())

          0           1           2         3           4           5    \
0  999.000000  999.000000  999.000000  999.0000  999.000000  999.000000   
1    0.007025    0.044272    0.003471    0.1025    0.009522    0.101452   
2  999.000000  999.000000  999.000000  999.0000  999.000000  999.000000   
3  999.000000  999.000000  999.000000  999.0000  999.000000  999.000000   
4  999.000000  999.000000  999.000000  999.0000  999.000000  999.000000   

          6          7           8           9       ...             825  \
0  999.000000  999.00000  999.000000  999.000000     ...      999.000000   
1    0.011671    0.00702    0.040531    0.020718     ...      999.000000   
2  999.000000  999.00000  999.000000  999.000000     ...        0.308309   
3  999.000000  999.00000  999.000000  999.000000     ...        0.376667   
4  999.000000  999.00000  999.000000  999.000000     ...      999.000000   

          826         827         828         829         830         831  \
0  999.000000  

### Hungarian 

In [14]:
cost = score_matrix
from scipy.optimize import linear_sum_assignment
row_ind, col_ind = linear_sum_assignment(cost)
print(col_ind)                                     # prints task number assigned to corresponding worker shown in row_ind

print(cost[row_ind, col_ind].sum())


[796 568 296 577 428 298 305 405 430 297 291 542 301 294 550 682 776 545
 556 744 583 267 300 643 543 546 736 511 303 277 664 661 404 372 593 645
 672 456 409 562 188 270 537 495 198 762 663 667 591 224 504 675 527 592
 458 733 786 654 820 203 574 731 197 218 387 831 807 821 538 800 632 531
 207 199  24 389 204 690 406 785 603 221 443 609 516 437 652 798  17 819
 544 244 564 498  98 101 624 526 275  62 799 724 100 295 779 768  37 614
 318 681   3 648 679 416 673 509  53 502 665 384 822 584 540 514 579 662
 620  40  19 116 613 808 340 525 414 791 767 489 327 455   0 642 266  60
  91 494  20 824 302  72 208   5 790 441 279 650 563 332 415 691 604 598
 361 211 307 357 756 605 397 559 293 692 444 436 424  16 177 260  67  55
 433 512 711 383 344 590 232  95  30 633  64 189 795 569  97 810  73 817
 549 607 308 105 280 157 398  35 710 103 330 774  47 465 388 171 201 187
 640 329 426 210 523 730 671 610 159 281 833 635  51 113 231 173 180 143
 185 183 317 369 378 747 274 268 292 364 250 120 46

### Results

In [15]:
print("Tasks given: ", score_matrix.shape[1])
total = score_matrix.shape[1]
i = 0
for j in col_ind:
    if score_matrix[i][j] == 999.0:
        total -= 1
    i+=1
        
print("Tasks assigned: ", total)

Tasks given:  835
Tasks assigned:  441
