In [28]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import hamming

In [89]:
def get_cont_cat(dataframe, var_type):
    '''
    Function to split and return the dataframe by the variable type needed.
    
    I/P:
    'dataframe': whole data as a pandas dataframe
    'var_type': variable type, "cont" or "cat", string
    
    O/P:
    'dataframe': dataframe with only columns of the needed variable type
    '''
    # Convert any series to dataframe
    if not isinstance(dataframe, pd.DataFrame):
        print('ip is not dataframe')
    cont_cols = ['dist_from_uni', 'budget']
    
    if var_type == 'cont':
        return dataframe[cont_cols]
    
    elif var_type == 'cat':
        return dataframe.drop(labels = cont_cols, axis = 1)
    
    else: raise ValueError('Variable type should be either "cont" or "cat"')

In [37]:
def get_cont_dist(person, database, metric):
    
    '''
    Function that returns a distance matrix for continuous features of the data.
    Feature normalization happens in here.
    I/P:
    'person': 1 x 3 numpy array of 1 person
    'database': (total data length) x 3 numpy array
    'metric': distance metric to be used for computation. One of: euclidean.
    
    O/P:
    'distance_matrix': 1 x (total data length) numpy array of distances
    '''
#     print('sdkfjsdkf')

    to_std = np.vstack((person, database))
    
    all_std = StandardScaler().fit_transform(to_std)
    person_std = all_std[0,:].reshape(1,-1)
    database_std = all_std[1:,:]
    
    if metric == 'euclidean':
        cont_distance_matrix = euclidean_distances(person_std, database_std)
        return cont_distance_matrix

In [38]:
def get_cat_dist(person, database, metric):
    '''
    Function that returns a distance matrix for categorical features of the data.
    I/P:
    'person': 1 x 3 numpy array of 1 person
    'database': (total data length) x 3 numpy array
    'metric': distance metric to be used for computation. One of: hamming
    
    O/P:
    'distance_matrix': 1 x (total data length) numpy array of distances
    '''
    cat_distance_matrix = []
    if metric == 'hamming':
        for index, c_row in database.iterrows():
            cat_distance_matrix.append(hamming(person, c_row))
    return(np.array(cat_distance_matrix))

In [118]:
def findRoommate(new_person, database, n_roommates, alpha, beta):
    # Split data by gender to reduce computations
    database_g = database[database['Gender'] == new_person.iloc[0]['Gender']]
#     print(len(database_g))
    name_g = [name_list[i] for i in list(database_g.index)]
#     print(name_g)

    # Split new datapoint into continuous and categorical sets
    new_person_cont = get_cont_cat(new_person, 'cont')
    new_person_cat = get_cont_cat(new_person, 'cat')
    
#     print(new_person_cat.shape, new_person_cont.shape)
    
    # Split database into continuous and categorical sets
    database_cont = get_cont_cat(database_g, 'cont')
    database_cat = get_cont_cat(database_g, 'cat')
    
#     print(database_cat.shape, database_cont.shape)
    
    
    # Get distances for both continuous and categorical sets
    dist_cont = get_cont_dist(new_person_cont, database_cont, 'euclidean')
#     print(dist_cont.shape, np.argsort(dist_cont))
    
    

    dist_cat = get_cat_dist(new_person_cat, database_cat, 'hamming')
    
#     print(dist_cont + dist_cat)
    # Create final distance matrix of weighted average
    # Have to experiment with different alpha(cont_coeff) and beta(cat_coeff)
    # We good till here
    final_dist = alpha*dist_cont + beta*dist_cat
#     print(final_dist)
    # Sort the distance matrix to get top n roommates
    top_n_matches = np.argsort(final_dist)[0][1 : n_roommates + 1]
    print(top_n_matches)
    print(list(name_g[j] for j in top_n_matches))

In [80]:
meta_data = pd.read_csv('./data/roommates2.csv')

In [81]:
meta_data.columns

Index(['No', 'Uni', 'Email address', 'Gender', 'Last Name', 'First Name',
       'Smoking', 'Alcohol', 'Nationality', 'School', 'Major', 'Habit',
       'Roommate', 'budget', 'dist_from_uni'],
      dtype='object')

In [82]:
meta_data = meta_data.drop(labels=['Email address', 'No', 'Last Name', 'First Name', 'Nationality', 'School', 'Major'], axis=1)

In [83]:
meta_data

Unnamed: 0,Uni,Gender,Smoking,Alcohol,Habit,Roommate,budget,dist_from_uni
0,hs2498,0,1,1,1,1,1000,5.0
1,ss4408,0,1,0,0,1,1100,2.5
2,bv1567,1,0,0,1,1,1300,2.0
3,ds7800,0,1,0,1,1,1600,3.0
4,ma9067,0,0,0,0,1,1500,3.3
...,...,...,...,...,...,...,...,...
3485,cd8293,1,0,1,0,1,1600,2.0
3486,wl1948,0,1,1,1,1,1650,3.5
3487,ka9393,1,1,0,0,1,1800,8.0
3488,rh6239,1,0,1,0,1,1100,4.5


In [60]:
test_person = meta_data.iloc[0]

In [61]:
test_person

Uni              hs2498
Gender                0
Smoking               1
Alchohol              1
Habit                 1
Roommate              1
budget             1000
dist_from_uni       5.0
Name: 0, dtype: object

In [109]:
test_p_cont = np.array(get_cont_cat(test_person, 'cont')).reshape((1,2))
db_cont = np.array(get_cont_cat(meta_data, 'cont'))

ip is not dataframe


In [108]:
test_p_cont.shape

(2, 1)

In [114]:
euclidean_distances(test_p_cont, db_cont).shape

(1, 3490)

In [119]:
findRoommate(test_person, meta_data, 5, 1, 1)

TypeError: string indices must be integers

In [126]:
meta_data['Gender']

0       0
1       0
2       1
3       0
4       0
       ..
3485    1
3486    0
3487    1
3488    1
3489    0
Name: Gender, Length: 3490, dtype: int64