In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import hamming

In [2]:
def get_cont_cat(dataframe, var_type):
    '''
    Function to split and return the dataframe by the variable type needed.
    
    I/P:
    'dataframe': whole data as a pandas dataframe
    'var_type': variable type, "cont" or "cat", string
    
    O/P:
    'dataframe': dataframe with only columns of the needed variable type
    '''
    # Convert any series to dataframe
    if not isinstance(dataframe, pd.DataFrame):
        print('ip is not dataframe')
    cont_cols = ['dist_from_uni', 'budget']
    
    if var_type == 'cont':
        return dataframe[cont_cols]
    
    elif var_type == 'cat':
        return dataframe.drop(labels = cont_cols, axis = 1)
    
    else: raise ValueError('Variable type should be either "cont" or "cat"')

In [3]:
def get_cont_dist(person, database, metric):
    
    '''
    Function that returns a distance matrix for continuous features of the data.
    Feature normalization happens in here.
    I/P:
    'person': 1 x 3 numpy array of 1 person
    'database': (total data length) x 3 numpy array
    'metric': distance metric to be used for computation. One of: euclidean.
    
    O/P:
    'distance_matrix': 1 x (total data length) numpy array of distances
    '''
#     print('sdkfjsdkf')

    to_std = np.vstack((person, database))
    
    all_std = StandardScaler().fit_transform(to_std)
    person_std = all_std[0,:].reshape(1,-1)
    database_std = all_std[1:,:]
    
    if metric == 'euclidean':
        cont_distance_matrix = euclidean_distances(person_std, database_std)
        return cont_distance_matrix

In [4]:
def get_cat_dist(person, database, metric):
    '''
    Function that returns a distance matrix for categorical features of the data.
    I/P:
    'person': 1 x 3 numpy array of 1 person
    'database': (total data length) x 3 numpy array
    'metric': distance metric to be used for computation. One of: hamming
    
    O/P:
    'distance_matrix': 1 x (total data length) numpy array of distances
    '''
    cat_distance_matrix = []
    if metric == 'hamming':
        for index, c_row in database.iterrows():
            cat_distance_matrix.append(hamming(person, c_row))
    return(np.array(cat_distance_matrix))

In [5]:
def findRoommate(new_person, database, n_roommates, alpha, beta):
    # Split data by gender to reduce computations
    database_g = database[database['Gender'] == new_person.iloc[0]['Gender']]
#     print(len(database_g))
    name_g = [name_list[i] for i in list(database_g.index)]
#     print(name_g)

    # Split new datapoint into continuous and categorical sets
    new_person_cont = get_cont_cat(new_person, 'cont')
    new_person_cat = get_cont_cat(new_person, 'cat')
    
#     print(new_person_cat.shape, new_person_cont.shape)
    
    # Split database into continuous and categorical sets
    database_cont = get_cont_cat(database_g, 'cont')
    database_cat = get_cont_cat(database_g, 'cat')
    
#     print(database_cat.shape, database_cont.shape)
    
    
    # Get distances for both continuous and categorical sets
    dist_cont = get_cont_dist(new_person_cont, database_cont, 'euclidean')
#     print(dist_cont.shape, np.argsort(dist_cont))
    
    

    dist_cat = get_cat_dist(new_person_cat, database_cat, 'hamming')
    
#     print(dist_cont + dist_cat)
    # Create final distance matrix of weighted average
    # Have to experiment with different alpha(cont_coeff) and beta(cat_coeff)
    # We good till here
    final_dist = alpha*dist_cont + beta*dist_cat
#     print(final_dist)
    # Sort the distance matrix to get top n roommates
    top_n_matches = np.argsort(final_dist)[0][1 : n_roommates + 1]
    print(top_n_matches)
    print(list(name_g[j] for j in top_n_matches))

In [2]:
def normalize(df, features):
    result = df.copy()
    for feature_name in features:
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        result[feature_name] = ((df[feature_name] - min_value) / (max_value - min_value))*2-1
    return result

In [3]:
meta_data = pd.read_csv('../data/roommates2.csv')

In [4]:
meta_data.columns

Index(['No', 'Uni', 'Email address', 'Gender', 'Last Name', 'First Name',
       'Smoking', 'Alcohol', 'Nationality', 'School', 'Major', 'Habit',
       'Roommate', 'budget', 'dist_from_uni', 'school_label'],
      dtype='object')

In [5]:
meta_data = meta_data.drop(labels=['Uni', 'Email address', 'No', 'Last Name', 'First Name', 'Nationality', 'School', 'Major'], axis=1)

In [6]:
meta_data

Unnamed: 0,Gender,Smoking,Alcohol,Habit,Roommate,budget,dist_from_uni,school_label
0,0,1,1,1,1,1000,5.0,0
1,0,1,0,0,1,1100,2.5,1
2,1,0,0,1,1,1300,2.0,1
3,0,1,0,1,1,1600,3.0,2
4,0,0,0,0,1,1500,3.3,3
...,...,...,...,...,...,...,...,...
3485,1,0,1,0,1,1600,2.0,4
3486,0,1,1,1,1,1650,3.5,17
3487,1,1,0,0,1,1800,8.0,3
3488,1,0,1,0,1,1100,4.5,10


In [7]:
features = ['budget', 'dist_from_uni']
normalized_data = normalize(meta_data, features)

In [8]:
normalized_data

Unnamed: 0,Gender,Smoking,Alcohol,Habit,Roommate,budget,dist_from_uni,school_label
0,0,1,1,1,1,-1.000000,0.142857,0
1,0,1,0,0,1,-0.866667,-0.571429,1
2,1,0,0,1,1,-0.600000,-0.714286,1
3,0,1,0,1,1,-0.200000,-0.428571,2
4,0,0,0,0,1,-0.333333,-0.342857,3
...,...,...,...,...,...,...,...,...
3485,1,0,1,0,1,-0.200000,-0.714286,4
3486,0,1,1,1,1,-0.133333,-0.285714,17
3487,1,1,0,0,1,0.066667,1.000000,3
3488,1,0,1,0,1,-0.866667,0.000000,10


In [9]:
test_person = normalized_data.iloc[0]

In [10]:
test_person

Gender           0.000000
Smoking          1.000000
Alcohol          1.000000
Habit            1.000000
Roommate         1.000000
budget          -1.000000
dist_from_uni    0.142857
school_label     0.000000
Name: 0, dtype: float64

In [11]:
x = test_person
dist_list = []
for i in range(normalized_data.shape[0]):
    y = normalized_data.iloc[i]
    dist = np.sqrt(np.sum([(a-b)*(a-b) for a, b in zip(x, y)]))
    dist_list.append(dist)
# dist_list

In [13]:
sorted(dist_list)[:10]

[0.0,
 0.0,
 0.9458825918426771,
 0.9458825918426771,
 0.9831228876620145,
 1.0,
 1.0400156984686455,
 1.0400156984686455,
 1.0400156984686455,
 1.0400156984686455]

In [19]:
normalized_data

Unnamed: 0,Gender,Smoking,Alcohol,Habit,Roommate,budget,dist_from_uni,school_label
0,0,1,1,1,1,-1.000000,0.142857,0
1,0,1,0,0,1,-0.866667,-0.571429,1
2,1,0,0,1,1,-0.600000,-0.714286,1
3,0,1,0,1,1,-0.200000,-0.428571,2
4,0,0,0,0,1,-0.333333,-0.342857,3
...,...,...,...,...,...,...,...,...
3485,1,0,1,0,1,-0.200000,-0.714286,4
3486,0,1,1,1,1,-0.133333,-0.285714,17
3487,1,1,0,0,1,0.066667,1.000000,3
3488,1,0,1,0,1,-0.866667,0.000000,10


In [47]:
# K-Means
from sklearn.cluster import KMeans
n_clusters = int(meta_data.shape[0] / 10)
km = KMeans(n_clusters = n_clusters
           )
mat = normalized_data.values
km.fit(mat)
# Get cluster assignment labels
labels = km.labels_
# Format results as a DataFrame
results = pd.DataFrame([normalized_data.index,labels]).T

In [48]:
results

Unnamed: 0,0,1
0,0,69
1,1,86
2,2,198
3,3,6
4,4,43
...,...,...
3485,3485,77
3486,3486,57
3487,3487,282
3488,3488,228


In [103]:
results[results[1]==results[1][0]]

Unnamed: 0,0,1
0,0,69
364,364,69
1044,1044,69
1122,1122,69
1648,1648,69
2144,2144,69


In [104]:
# KNN
# from sklearn.neighbors import KNeighborsClassifier
# classifier = KNeighborsClassifier(n_neighbors=5)
# classifier.fit(X_train, y_train)

In [97]:
# cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
test_person = normalized_data.iloc[0]
cs_list = []
for i in range(normalized_data.shape[0]):
    cs = np.min(cosine_similarity([test_person, normalized_data.iloc[i]]))
    cs_list.append(cs)
cs_list

[1.0000000000000002,
 0.6155402517746794,
 0.5051754582400932,
 0.521208738844587,
 0.17922794487477398,
 0.1999963785501523,
 0.1221179499408405,
 0.2995629578652558,
 0.23991508040392964,
 0.20927053683778565,
 0.5673911317380161,
 0.6860765465784374,
 0.21597917154983914,
 0.18300380247989123,
 0.4922171239114316,
 0.32292769381481934,
 0.08740472332889541,
 0.09395200092858867,
 0.13300295535850248,
 0.09033812802602269,
 0.30931200448771096,
 0.09475426502019338,
 0.11942657930457069,
 0.5516072467156634,
 -0.001553045475413456,
 0.10188000848454466,
 0.08190911870671226,
 0.024833703843272673,
 0.22971608228446722,
 0.20927053683778565,
 0.10401083892934829,
 0.13882111503555827,
 0.05678546652020564,
 0.8434347122457899,
 0.09791688735812476,
 0.1504462041343757,
 0.023706590394559787,
 0.0951444407683182,
 0.07627905557850198,
 0.09200970765603807,
 0.051241384240921124,
 0.19503575564490427,
 0.13361642696492532,
 0.18526495282864058,
 0.03997818964611819,
 0.09893225810803907

In [109]:
s = np.array(cs_list)
sort_index = np.argsort(s)
print(sort_index[::-1])

[   0 2144 3392 ... 1324 1733   75]


In [107]:
normalized_data.iloc[0]

Gender           0.000000
Smoking          1.000000
Alcohol          1.000000
Habit            1.000000
Roommate         1.000000
budget          -1.000000
dist_from_uni    0.142857
school_label     0.000000
Name: 0, dtype: float64

In [111]:
normalized_data.iloc[3392]

Gender           0.000000
Smoking          1.000000
Alcohol          1.000000
Habit            1.000000
Roommate         1.000000
budget          -0.600000
dist_from_uni   -0.714286
school_label     0.000000
Name: 3392, dtype: float64

In [13]:
test_p_cont = np.array(get_cont_cat(test_person, 'cont')).reshape((1,2))
db_cont = np.array(get_cont_cat(meta_data, 'cont'))

ip is not dataframe


In [14]:
test_p_cont.shape

(1, 2)

In [15]:
euclidean_distances(test_p_cont, db_cont).shape

(1, 3490)

In [17]:
findRoommate(test_person, meta_data, 5, 1, 1)

TypeError: string indices must be integers

In [126]:
meta_data['Gender']

0       0
1       0
2       1
3       0
4       0
       ..
3485    1
3486    0
3487    1
3488    1
3489    0
Name: Gender, Length: 3490, dtype: int64