In [171]:
import csv
import pandas as pd
import math
import random


In [154]:
df = pd.read_csv('data.csv')    

In [116]:
from geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="cynthia-app")

In [119]:
df['latitude'] = df.apply(lambda row: geolocator.geocode(row['hometown']).latitude, axis = 1)

In [123]:
df['longitude'] = df.apply(lambda row: geolocator.geocode(row['hometown']).longitude, axis = 1)

### ID data

In [77]:
# create id dict
info_by_name = {}

for i in range(len(df)):
    info_by_name[df.iloc[i]['name'].strip(' ')] = {'id': i}
    
respondent_names = list(info_by_name.keys())
num_respondents = len(respondent_names)

In [78]:
for i in range(len(df)):
    s = df.iloc[i]['roommates']
    if type(s) != str: continue
    s = s.replace('\n', ', ')
    split_list = s.split(', ')
    roommates_list = []
    for s in split_list:
        cleaned_name = s.strip(',').strip(' ')
        if cleaned_name not in info_by_name:
            info_by_name[cleaned_name] = {'id': len(info_by_name), 'connections': set()}
        roommates_list.append(cleaned_name)
        
    s = df.iloc[i]['people']
    if type(s) != str: continue
    s = s.replace('\n', ', ')
    split_list = s.split(', ')
    people_list = []
    for s in split_list:
        cleaned_name = s.strip(',').strip(' ')
        if cleaned_name not in info_by_name:
            info_by_name[cleaned_name] = {'id': len(info_by_name), 'connections': set()}
        people_list.append(cleaned_name)

### Connections

In [79]:
roommates_respondents = []
friendships_respndents = []
roommates_other = []
friendships_other = []

for i in range(len(df)):
    respondent_name = df.iloc[i]['name']
    respondent_id = i
    
    s = df.iloc[i]['roommates']
    if type(s) != str: continue
    s = s.replace('\n', ', ')
    split_list = s.split(', ')
    
    for s in split_list:
        roommate_name = s.strip(',').strip(' ')
        roommate_id = info_by_name[roommate_name]['id']
        if roommate_id < num_respondents:
            roommates_respondents.append([min(respondent_id, roommate_id), max(respondent_id, roommate_id)])
        else:
            roommates_other.append([respondent_id, roommate_id])
            info_by_name[roommate_name]['connections'].add(respondent_id)
            
    s = df.iloc[i]['people']
    if type(s) != str: continue
    s = s.replace('\n', ', ')
    split_list = s.split(', ')
    for s in split_list:
        friend_name = s.strip(',').strip(' ')
        friend_id = info_by_name[friend_name]['id']
        if friend_id < num_respondents:
            friendships_respndents.append([min(respondent_id, friend_id), max(respondent_id, friend_id)])
        else:
            friendships_other.append([respondent_id, friend_id])
            info_by_name[friend_name]['connections'].add(respondent_id)

In [67]:
roommates_respondents = list(list(i) for i in set(tuple(i) for i in roommates_respondents))
friendships_respndents = list(list(i) for i in set(tuple(i) for i in friendships_respndents))

num_connections = sum([len(roommates_respondents), len(friendships_respndents), 
                       len(roommates_other), len(friendships_other)])
print(num_connections, 'connections visualized!!')

665 connections visualized!!


### Hometown

In [None]:
# do some force field stuff for lat + longitude

## Compile Respondent Data

#### Concentration Encoding

In [155]:
conc_to_id = {}
for curr in df['concentration']:
    if 'Secondary' in curr:
        curr = curr.replace(', Secondary in ', ' + ')
    curr_list = curr.split(' + ')
    for c in curr_list:
        if c[-1] == " ":
            c = c[:-1]
        if c not in conc_to_id:
            conc_to_id[c] = len(conc_to_id)

#### State/Country Encoding

In [156]:
state_to_id = {'International': 0}
for curr in df['hometown']:
    s = curr.split(', ')
    if len(s) == 3:
        state = s[1]
        if state not in state_to_id:
            state_to_id[state] = len(state_to_id)

In [157]:
data = []
for i in range(num_respondents):
    row = df.iloc[i]
    
    # get state or country 
    where_from = ''
    s = row['hometown'].split(', ')
    if len(s) == 3:
        where_from = state_to_id[s[1]]
    else:
        where_from = '0'
        
    # get concentration encoding
    conc_code = []
    conc = row['concentration']
    secondary_code = -1
    if 'Secondary' in conc:
        conc_secondary_split = conc.split(', Secondary in ')
        secondary = conc_secondary_split[-1]
        if secondary[-1] == " ": secondary = secondary[:-1]
        secondary_code = conc_to_id[secondary]
        conc = conc_secondary_split[0]
    conc_list = conc.split(' + ')
    for c in conc_list:
        if c[-1] == " ": c = c[:-1]
        conc_code.append(conc_to_id[c])
    
    x = 0 # row['latitude']
    y = 0 # row['longitude']
    person_info = [i, row['name'], x, y, where_from, row['class'], conc_code, secondary_code]
    
    data.append(person_info)

In [160]:
respondent_data = pd.DataFrame(data, columns=['id', 'name', 'x', 'y', 'from', 
                                              'class', 'concentration', 'secondary'])
respondent_data

Unnamed: 0,id,name,x,y,from,class,concentration,secondary
0,0,Dylan Wilson,0,0,1,2025,[0],-1
1,1,Luke Tomes,0,0,2,2024,[1],-1
2,2,Cynthia Chen,0,0,3,2024,[2],3
3,3,Karley Merkley,0,0,0,2023,[4],-1
4,4,Sarah Mrad,0,0,0,2024,[5],-1
5,5,Audrey Gunawan,0,0,4,2025,"[6, 0]",-1
6,6,Robin Robinson,0,0,5,2023,[3],1
7,7,Kira Nagoshi,0,0,6,2024,[7],-1
8,8,Saylor Willauer,0,0,5,2024,[7],-1
9,9,Lily Roberts,0,0,7,2025,[8],-1


## Compile data for non-respondents

In [173]:
data = []
for name in info_by_name:
    if name in respondent_names:
        continue
    row = info_by_name[name]
    curr_data = [row['id'], name]
    
    conns = list(row['connections'])
    num_connections = len(row['connections'])
    x = 0
    y = 0
    if num_connections > 1:
        sum_x = 0
        sum_y = 0
        for c in conns:
            sum_x += respondent_data.iloc[c]['x']
            sum_y += respondent_data.iloc[c]['x']
        x = sum_x / num_connections
        y = sum_y / num_connections
        # can add randomness factor here
    else:
        single_conn = conns[0]
        theta = random.randrange(0, 360)
        r = random.randrange(6, 8)
        x = respondent_data.iloc[c]['x'] + r*math.cos(theta)
        y = respondent_data.iloc[c]['x'] + r*math.sin(theta)
    print([x, y])

[0.0, 0.0]
[0.0, 0.0]
[0.0, 0.0]
[-4.11878778812772, -4.362979160661894]
[0.0, 0.0]
[0.0, 0.0]
[0.0, 0.0]
[0.0, 0.0]
[0.0, 0.0]
[0.0, 0.0]
[0.0, 0.0]
[-4.1711999126410975, 2.757007669336975]
[0.1547245091414652, 4.99760545924455]
[4.452925180882696, 4.021375055060249]
[0.0, 0.0]
[4.748388489412716, 1.5661439121654257]
[0.0, 0.0]
[0.0, 0.0]
[0.0, 0.0]
[-0.7711083312154715, -4.940181367270851]
[-6.0979281937567835, -3.437334976953241]
[-2.2532252156455805, 5.56084311301613]
[1.3332146617996865, 4.818976931420439]
[-4.11878778812772, -4.362979160661894]
[0.0, 0.0]
[0.0, 0.0]
[0.0, 0.0]
[0.0, 0.0]
[-1.0301258709346381, -5.910908617973311]
[0.0, 0.0]
[0.0, 0.0]
[3.7107709840689127, 3.3511458792168733]
[0.0, 0.0]
[0.0, 0.0]
[0.0, 0.0]
[5.906291703795029, -1.0562756788272682]
[-3.0152659148723115, -5.18730869166365]
[0.0, 0.0]
[-0.7152603893916941, 5.957214330151908]
[0.0, 0.0]
[0.0, 0.0]
[-6.128216213305935, 3.3830409460977955]
[0.0, 0.0]
[0.0, 0.0]
[0.0, 0.0]
[-5.609338235623412, -2.1296301

In [126]:
# If number connections > 1, take average location of connections to get location
# Otherwise random around a circle of single connection



In [None]:
other_data = pd.DataFrame(data, columns=['id', 'name', 'x', 'y'])
other_data