In [19]:
import pandas as pd
from tqdm import tqdm

In [20]:
# Load the raw distance table. The 'Distance in Kilometres' column holds the
# row city names; every other column is named after a city.
df = pd.read_csv('data.csv')

In [21]:
# Preview the first rows to check that the table was parsed as expected.
df.head()

Unnamed: 0,Distance in Kilometres,Ahmedabad,Bangalore,Bhubaneshwar,Bombay,Calcutta,Chandigarh,Cochin,Delhi,Hyderabad,...,Jaipur,Kanpur,Lucknow,Madras,Nagpur,Nasik,Panjim,Patna,Pondicherry,Pune
0,Agartala,3305,3824,2286,3593,1863,2998,4304,2708,3330,...,2801,2281,2252,3493,2696,3365,3507,1681,3661,3442
1,Agra,878,1848,1578,1202,1300,448,2278,200,1246,...,230,290,369,2048,770,1005,1715,885,2210,1214
2,Ahmedabad,-,1490,1697,552,2068,1157,1845,911,1436,...,648,1168,1247,1821,965,504,1165,1656,1818,664
3,Allahabad,1251,1686,1090,1457,817,912,2216,650,1084,...,713,193,234,2011,608,1155,1419,402,1077,1364
4,Amritsar,1356,2496,2224,1849,1919,239,3163,445,1892,...,706,926,939,2688,1416,1665,2237,1531,2856,1862


In [22]:
# Assertions to check the validity of the data:
#   1. every column city must also appear as a row city, and
#   2. wherever both directions are present, the table must be symmetric:
#      the cell (row city, column city) must equal the mirrored cell
#      (column city, row city). Each cell is compared against its mirror,
#      not against itself, so an inconsistent table actually fails here.
cities = df['Distance in Kilometres']
for i in range(1, len(df.columns)):
    col_city = df.columns[i]
    assert col_city in cities.values, f'{col_city} has no row in the table'
    # Row label of the column city, needed to look up the mirrored cell.
    col_city_row = cities[cities == col_city].index[0]
    dist = df[col_city]
    for j in range(len(dist)):
        if cities[j] not in df.columns:
            # Only cities that are both a row and a column have a mirrored cell.
            continue
        dist_1 = dist[j]                          # row cities[j], column col_city
        dist_2 = df.loc[col_city_row, cities[j]]  # row col_city, column cities[j]
        assert dist_1 == dist_2, (
            f'asymmetric distance: {cities[j]} / {col_city} is {dist_1}, '
            f'{col_city} / {cities[j]} is {dist_2}'
        )

In [23]:
# Separating class 1 cities into a new dataframe: one row per column city of
# `df`, one column per row city of `df` (the distance table transposed).
name_col = 'Distance in Kilometres'
class_1_names = list(df.columns[1:])
df_ = {name_col: class_1_names}
df_.update({row_city: [] for row_city in df[name_col]})
for from_city in class_1_names:
    # Walk the column top to bottom, filing each distance under its row city.
    for row_city, to_city_dist in zip(df[name_col], df[from_city]):
        df_[row_city].append(to_city_dist)
df_ = pd.DataFrame(df_)

In [24]:
# Remove class 1 cities (those that appear as column headers) from the
# class 2 table. A boolean mask replaces the row-by-row in-place drop, so the
# cell can be re-run safely: once the rows are gone the mask is simply all
# False instead of `.index[0]` raising IndexError on an empty selection.
is_class_1 = df['Distance in Kilometres'].isin(df.columns[1:])
df = df.loc[~is_class_1].copy()

In [25]:
# Show both tables, each under its own heading.
for heading, table in (('Class 1 cities', df_), ('Class 2 cities', df)):
    print(heading)
    print(table)

Class 1 cities
   Distance in Kilometres Agartala  Agra Ahmedabad Allahabad Amritsar Asansol  \
0               Ahmedabad     3305   878         -      1251     1356    1842   
1               Bangalore     3824  1848      1490      1686     2496    2187   
2            Bhubaneshwar     2286  1578      1697      1090     2224     523   
3                  Bombay     3593  1202       552      1457     1849    2040   
4                Calcutta     1863  1300      2068       817     1919     226   
5              Chandigarh     2998   448      1157       912      239    1503   
6                  Cochin     4304  2278      1845      2216     3163    2544   
7                   Delhi     2708   200       911       650      445    1262   
8               Hyderabad     3330  1246      1436      1084     1892    1693   
9                  Indore     2891   591       442       803     1258    1394   
10                 Jaipur     2801   230       648       713      706    1304   
11           

In [26]:
# Persist both tables without the pandas index column.
for table, path in ((df_, 'class_1_cities.csv'), (df, 'class_2_cities.csv')):
    table.to_csv(path, index=False)

## Heuristic formation

In [27]:
# The heuristic tables start as copies of the distance tables with the city
# column relabelled and a fresh 0..n-1 index.
# Mind the numbering: df1_h is derived from df (class 2 cities) and
# df2_h from df_ (class 1 cities).
to_heuristic = {'Distance in Kilometres': 'Heuristic'}
df1_h = df.rename(columns=to_heuristic).reset_index(drop=True).copy()
df2_h = df_.rename(columns=to_heuristic).reset_index(drop=True).copy()

In [28]:
import requests


def get_geodist(c1, c2, timeout=30):
    """Return the distance between two cities as reported by distance24.org.

    Parameters
    ----------
    c1, c2 : str
        City names, sent as the two stops of the route query.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    The value of the ``distance`` field of the JSON response.

    Raises
    ------
    requests.HTTPError
        If the server answers with any status other than 200.
    requests.RequestException
        On connection problems or when the timeout expires.
    """
    URL = f'https://www.distance24.org/route.json?stops={c1}|{c2}'
    # Without a timeout a stalled connection would block the notebook forever.
    r = requests.get(url=URL, timeout=timeout)
    # Explicit check instead of `assert`: asserts are stripped under
    # `python -O` and carry no information about which request failed.
    if r.status_code != 200:
        raise requests.HTTPError(
            f'distance24 returned HTTP {r.status_code} for {c1!r} -> {c2!r}',
            response=r,
        )
    data = r.json()
    return data['distance']

In [29]:
# Overwrite every cell of df1_h with the get_geodist value for
# (row city, column city).
# Cells are written with .at[row, column]: chained assignment
# (frame[col][row] = value) may write to a temporary object and leaves the
# frame unchanged under pandas Copy-on-Write.
for i in tqdm(range(len(df1_h))):
    city_1 = df1_h.at[i, 'Heuristic']
    for city_2 in df1_h.columns[1:]:
        df1_h.at[i, city_2] = get_geodist(city_1, city_2)

100%|██████████| 27/27 [06:55<00:00, 15.40s/it]


In [30]:
# Overwrite every cell of df2_h with the get_geodist value for
# (row city, column city).
# Cells are written with .at[row, column]: chained assignment
# (frame[col][row] = value) may write to a temporary object and leaves the
# frame unchanged under pandas Copy-on-Write.
for i in tqdm(range(len(df2_h))):
    city_1 = df2_h.at[i, 'Heuristic']
    for city_2 in df2_h.columns[1:]:
        df2_h.at[i, city_2] = get_geodist(city_1, city_2)

100%|██████████| 20/20 [11:30<00:00, 34.52s/it]


In [31]:
# Write the heuristic tables without the pandas index column.
# The file numbers and frame numbers are crossed on purpose here:
# df2_h goes to heuristic_1_cities.csv and df1_h to heuristic_2_cities.csv.
for table, path in ((df2_h, 'heuristic_1_cities.csv'), (df1_h, 'heuristic_2_cities.csv')):
    table.to_csv(path, index=False)