In [None]:
# Mount Google Drive at /content/drive so the notebook can read the survey CSV
# and write the generated tables from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

In [None]:
# Folder on the mounted Drive that holds every input and output file used below.
base_path = '/content/drive/My Drive/SCET/Data/'
# Raw questionnaire responses (the file name suggests a Google Forms export).
response_path = base_path + 'SCET Friendship questionnaire (Responses) - Form Responses 1.csv'

# Import Data
Simulating receiving 3 tables:
1. Individual data: what each user fills out about themselves, plus their activity enjoyment level ratings from the survey.
2. Activity data: if this data came from a real app (like the mockup from the presentation), it would be collected after a pair of users perform an activity and rate their enjoyment level of that activity.
3. Relationship data: keeps track of who is friends with whom.

## Generate Correct Tables
Just need to run once.

In [None]:
# Survey columns removed from the raw responses when full_data is built.
deleted_features = ['Favorite movie genre?', 
                    'What qualities do you look for in a friend?', 
                    'Social distance run']
# Survey meta/feedback columns.
# NOTE(review): this list is not referenced by any other cell visible here.
feedback_features = ['Timestamp', 'Email Address', 
                     'Any comments on the overall structure of the survey?', 
                     'Are the questions relevant to qualities you look for in a friend?', 
                     'Any suggestions on question we should be asking?', 
                     'What has been your favorite activity recently and why? ', 
                     'What are some virtual activities that you have been enjoying recently? ']
# Self-description questions that make up the per-user profile table.
# Some column names carry a trailing space; they must match the CSV header exactly.
user_features = ['Age', 'Gender', 'College major?', 'How outdoorsy are you?',
                'When is your preferred time to hang out with friends? ',
                'What is your preferred way of spending time with friends?',
                'How often do you like to spend time with your friends?',
                'How many people do you like to spend time with at once?',
                'What is your top love language?', 'Introvert or extrovert?']
# Activity enjoyment rating columns. The bracketed suffix of the long
# "Rank these activities ..." headers is the activity name.
# (The variable name is misspelled, but later cells reference it as-is.)
activites_features = ['Rank these activities on how much you enjoy them? (5 is most enjoyable) [Hiking]',
                      'Rank these activities on how much you enjoy them? (5 is most enjoyable) [Journaling]',
                      'Rank these activities on how much you enjoy them? (5 is most enjoyable) [Reading nonfiction]',
                      'Rank these activities on how much you enjoy them? (5 is most enjoyable) [Drawing]',
                      'Rank these activities on how much you enjoy them? (5 is most enjoyable) [Hanging out with friends. (pre-COVID)]',
                      'Social distance exercise (walk, run, etc.)', 
                      'Netflix party',
                      'Video chat hangout', 
                      'Wine tasting/Cocktail shake up',
                      'Trivia contests', 
                      'Virtual escape room', 
                      'Arts and crafts ']
# Name columns; used to build the name -> UserID lookup and then dropped.
user_id = ['First Name', 'Last Name']

In [None]:
# Raw responses with a positional UserID column added.
# NOTE(review): `data` is not used by any later cell visible here - confirm it
# is still needed.
data = pd.read_csv(response_path)
data['UserID'] = range(data.shape[0])
# Working copy of the responses: rows labelled 0-5 and the questions listed in
# deleted_features are removed.
# NOTE(review): the reason for excluding the first six responses is not stated
# here (presumably test/pilot entries) - confirm.
full_data = pd.read_csv(response_path)
full_data = full_data.drop(range(6)).drop(deleted_features, axis=1)

In [None]:
# Module-level lookup from "First Last" (title-cased) to the assigned UserID.
# It is filled as a side effect of anonymize().
names_to_id = {}
def anonymize(data, user_id):
  """Replace the name columns of `data` with a sequential integer UserID.

  Args:
    data: DataFrame containing the columns listed in `user_id`. A `UserID`
      column (0, 1, 2, ... in row order) is added to it in place.
    user_id: list of identifying column names. The first two entries are
      treated as first name and last name when building the lookup key.

  Returns:
    A new DataFrame equal to `data` (including `UserID`) without the
    `user_id` columns.

  Side effects:
    Adds one "First Last" -> UserID entry per row to the module-level
    `names_to_id` dict. Rows with identical names overwrite each other.
  """
  data['UserID'] = range(data.shape[0])
  first_col, last_col = user_id[0], user_id[1]
  # Read the values by column label. Indexing a string-labelled row Series
  # with integers (r[0], r[1], r[2]) is deprecated positional access in pandas.
  for uid, first, last in zip(data['UserID'], data[first_col], data[last_col]):
    names_to_id["{} {}".format(first, last).title()] = uid
  return data.drop(user_id, axis=1)

In [None]:
# Per-user profile table: name columns plus self-description answers, with the
# names replaced by a UserID.
individual_data = full_data[user_id + user_features].copy()
individual_data = anonymize(individual_data, user_id)
# to_csv keeps its default index=True here, so the row labels are written as
# an unnamed first column of individual.csv.
individual_data.to_csv(base_path + 'individual.csv')
individual_data.head(2)

Unnamed: 0,Age,Gender,College major?,How outdoorsy are you?,When is your preferred time to hang out with friends?,What is your preferred way of spending time with friends?,How often do you like to spend time with your friends?,How many people do you like to spend time with at once?,What is your top love language?,Introvert or extrovert?,UserID
6,22,Female,Econ + Data Sci,Very outdoorsy,"Weekends during the day, Weekends at night","Grabbing foods or drinks together, Doing an ac...",Once a week,The more the merrier,Quality time,Extrovert,0
7,20,Female,STEM,Very outdoorsy,"Weekdays during the day, Weekends during the d...","Grabbing foods or drinks together, Doing an ac...",2-3 times a week,Small groups (up to 5 people),Acts of service,Introvert,1


In [None]:
def clean_responses(responses):
  """Shorten bracketed survey headers to the text inside the brackets.

  For every label containing "[", the result keeps what follows the first
  "[" minus the final character (the closing "]" in the survey headers).
  Labels without "[" are returned unchanged.

  Args:
    responses: iterable of column-label strings.

  Returns:
    A list with one cleaned label per input label, in the same order.
  """
  def _strip_prompt(label):
    if "[" not in label:
      return label
    # Text after the first "[", with the last character dropped.
    return label.partition("[")[2][:-1]

  return [_strip_prompt(label) for label in responses]

In [None]:
# Per-user activity ratings: copy the name and rating columns, replace the
# names with a UserID, then shorten the long survey headers to activity names.
individual_activity = anonymize(
    full_data[user_id + activites_features].copy(),
    user_id,
)
individual_activity.columns = clean_responses(individual_activity.columns)
print(individual_activity.shape)
individual_activity.head(2)

(22, 13)


Unnamed: 0,Hiking,Journaling,Reading nonfiction,Drawing,Hanging out with friends. (pre-COVID),"Social distance exercise (walk, run, etc.)",Netflix party,Video chat hangout,Wine tasting/Cocktail shake up,Trivia contests,Virtual escape room,Arts and crafts,UserID
6,5,4,3,5,5,3.0,5.0,5.0,5.0,4.0,4.0,,0
7,5,3,2,3,5,5.0,3.0,3.0,5.0,5.0,3.0,4.0,1


In [None]:
# Every column of the ratings table except the identifier names an activity.
activities_list = [col for col in individual_activity.columns if col != "UserID"]

# Accumulators: user1/user2 get one entry per user pair; the remaining lists
# get one entry per (pair, activity) and are filled in lock-step.
user1, user2 = [], []
user1_ID, user2_ID = [], []
user1_enjoyment, user2_enjoyment = [], []
activity_col = []

for u1 in individual_data['UserID']:
  for u2 in individual_data['UserID']:
    # Keep each unordered pair once (smaller id first) and skip self-pairs.
    if not u1 < u2:
      continue
    user1.append(u1)
    user2.append(u2)
    u1_acts = individual_activity.query('UserID == {}'.format(u1))
    u2_acts = individual_activity.query('UserID == {}'.format(u2))
    for activity in activities_list:
      user1_ID.append(u1)
      user2_ID.append(u2)
      # First matching row holds the user's rating for this activity.
      user1_enjoyment.append(u1_acts[activity].iloc[0])
      user2_enjoyment.append(u2_acts[activity].iloc[0])
      activity_col.append(activity)

# Pair table: one row per (User1, User2) with User1 < User2.
relationships = pd.DataFrame({'User1': user1, 'User2': user2})
# Long table: one row per pair and activity with both users' ratings.
activities_df = pd.DataFrame({
    'User1_ID': user1_ID,
    'User2_ID': user2_ID,
    'User1_Enjoyment': user1_enjoyment,
    'User2_Enjoyment': user2_enjoyment,
    'Activity_Name': activity_col,
})

In [None]:
# Persist the pair table. The default index=True writes the row index as an
# unnamed first column of relationships.csv.
relationships.to_csv(base_path + 'relationships.csv')
relationships.head(2)

Unnamed: 0,User1,User2
0,0,1
1,0,2


In [None]:
# Persist the pair-by-activity table. The default index=True writes the row
# index as an unnamed first column of activities.csv.
activities_df.to_csv(base_path + 'activities.csv')
activities_df.tail(2)

Unnamed: 0,User1_ID,User2_ID,User1_Enjoyment,User2_Enjoyment,Activity_Name
2770,20,21,2.0,3.0,Virtual escape room
2771,20,21,3.0,3.0,Arts and crafts


## Read CSV
Instead of creating them.

In [None]:
# Reload the three tables written by the "Generate Correct Tables" cells.
# Those cells save with pandas' default index=True, so the first CSV column is
# the saved row index. index_col=0 restores it as the index, so it does not
# come back as a stray 'Unnamed: 0' data column in every later merge and in X.
individual_data = pd.read_csv(base_path+'individual.csv', index_col=0)
relationships_df = pd.read_csv(base_path + 'relationships.csv', index_col=0)
activities_df = pd.read_csv(base_path + 'activities.csv', index_col=0)

# Preprocessing

## Preprocessing Group Data

In [None]:
# Attach both users' profile answers to every pair: columns suffixed _x belong
# to User1 and columns suffixed _y to User2.
# The pairs come from relationships_df (loaded in the "Read CSV" cell), not the
# in-memory `relationships` frame, which only exists when the one-off
# generation cells were run in the same kernel session. Only the two id columns
# are selected, so any extra column stored in the CSV cannot collide with the
# merge suffixes.
relationship_data_extended = (
    relationships_df[['User1', 'User2']]
    .merge(individual_data, left_on='User1', right_on='UserID')
    .merge(individual_data, left_on='User2', right_on='UserID')
    .drop(columns=['User1', 'User2'])
)
relationship_data_extended.tail(2)

Unnamed: 0,Unnamed: 0_x,Age_x,Gender_x,College major?_x,How outdoorsy are you?_x,When is your preferred time to hang out with friends? _x,What is your preferred way of spending time with friends?_x,How often do you like to spend time with your friends?_x,How many people do you like to spend time with at once?_x,What is your top love language?_x,Introvert or extrovert?_x,UserID_x,Unnamed: 0_y,Age_y,Gender_y,College major?_y,How outdoorsy are you?_y,When is your preferred time to hang out with friends? _y,What is your preferred way of spending time with friends?_y,How often do you like to spend time with your friends?_y,How many people do you like to spend time with at once?_y,What is your top love language?_y,Introvert or extrovert?_y,UserID_y
229,25,21,Male,STEM,Very outdoorsy,"Weekdays at night, Weekends during the day, We...","Grabbing foods or drinks together, Doing an ac...",Everyday,Small groups (up to 5 people),Physical touch,Introvert,19,27,19,Male,STEM,Somewhat outdoorsy,Weekends at night,"Doing an activity or exploring, Playing sport,...",2-3 times a week,Small groups (up to 5 people),Physical touch,Introvert,21
230,26,20,Female,STEM,Somewhat outdoorsy,"Weekdays during the day, Weekdays at night, We...","Grabbing foods or drinks together, Doing an ac...",2-3 times a week,Small groups (up to 5 people),Quality time,Introvert,20,27,19,Male,STEM,Somewhat outdoorsy,Weekends at night,"Doing an activity or exploring, Playing sport,...",2-3 times a week,Small groups (up to 5 people),Physical touch,Introvert,21


In [None]:
# Expand each pair into one row per activity by joining the pair profiles with
# the pair-by-activity ratings on the two user ids (default inner join).
full_relationship_data = relationship_data_extended.merge(activities_df, left_on=['UserID_x', 'UserID_y'], right_on=['User1_ID', 'User2_ID'])
full_relationship_data.tail(2)

Unnamed: 0.1,Unnamed: 0_x,Age_x,Gender_x,College major?_x,How outdoorsy are you?_x,When is your preferred time to hang out with friends? _x,What is your preferred way of spending time with friends?_x,How often do you like to spend time with your friends?_x,How many people do you like to spend time with at once?_x,What is your top love language?_x,Introvert or extrovert?_x,UserID_x,Unnamed: 0_y,Age_y,Gender_y,College major?_y,How outdoorsy are you?_y,When is your preferred time to hang out with friends? _y,What is your preferred way of spending time with friends?_y,How often do you like to spend time with your friends?_y,How many people do you like to spend time with at once?_y,What is your top love language?_y,Introvert or extrovert?_y,UserID_y,Unnamed: 0,User1_ID,User2_ID,User1_Enjoyment,User2_Enjoyment,Activity_Name
2770,26,20,Female,STEM,Somewhat outdoorsy,"Weekdays during the day, Weekdays at night, We...","Grabbing foods or drinks together, Doing an ac...",2-3 times a week,Small groups (up to 5 people),Quality time,Introvert,20,27,19,Male,STEM,Somewhat outdoorsy,Weekends at night,"Doing an activity or exploring, Playing sport,...",2-3 times a week,Small groups (up to 5 people),Physical touch,Introvert,21,2770,20,21,2.0,3.0,Virtual escape room
2771,26,20,Female,STEM,Somewhat outdoorsy,"Weekdays during the day, Weekdays at night, We...","Grabbing foods or drinks together, Doing an ac...",2-3 times a week,Small groups (up to 5 people),Quality time,Introvert,20,27,19,Male,STEM,Somewhat outdoorsy,Weekends at night,"Doing an activity or exploring, Playing sport,...",2-3 times a week,Small groups (up to 5 people),Physical touch,Introvert,21,2771,20,21,3.0,3.0,Arts and crafts


# Input Data

In [None]:
# Model inputs: everything except the user identifiers and the activity name.
# NOTE(review): Activity_Name is dropped here, yet the targets built in the
# "Output Data" section are derived solely from Activity_Name. The inputs
# therefore carry no information about which activity a row describes -
# confirm this is intended.
X = full_relationship_data.drop(columns=['UserID_x', 'UserID_y', 'User1_ID', 'User2_ID', 'Activity_Name'])
print(X.shape)
X.head(2)

(2772, 25)


Unnamed: 0.1,Unnamed: 0_x,Age_x,Gender_x,College major?_x,How outdoorsy are you?_x,When is your preferred time to hang out with friends? _x,What is your preferred way of spending time with friends?_x,How often do you like to spend time with your friends?_x,How many people do you like to spend time with at once?_x,What is your top love language?_x,Introvert or extrovert?_x,Unnamed: 0_y,Age_y,Gender_y,College major?_y,How outdoorsy are you?_y,When is your preferred time to hang out with friends? _y,What is your preferred way of spending time with friends?_y,How often do you like to spend time with your friends?_y,How many people do you like to spend time with at once?_y,What is your top love language?_y,Introvert or extrovert?_y,Unnamed: 0,User1_Enjoyment,User2_Enjoyment
0,6,22,Female,Econ + Data Sci,Very outdoorsy,"Weekends during the day, Weekends at night","Grabbing foods or drinks together, Doing an ac...",Once a week,The more the merrier,Quality time,Extrovert,7,20,Female,STEM,Very outdoorsy,"Weekdays during the day, Weekends during the d...","Grabbing foods or drinks together, Doing an ac...",2-3 times a week,Small groups (up to 5 people),Acts of service,Introvert,0,5.0,5.0
1,6,22,Female,Econ + Data Sci,Very outdoorsy,"Weekends during the day, Weekends at night","Grabbing foods or drinks together, Doing an ac...",Once a week,The more the merrier,Quality time,Extrovert,7,20,Female,STEM,Very outdoorsy,"Weekdays during the day, Weekends during the d...","Grabbing foods or drinks together, Doing an ac...",2-3 times a week,Small groups (up to 5 people),Acts of service,Introvert,1,4.0,3.0


In [None]:
def preprocess(data):
  """One-hot encode the categorical survey answers of both users in a pair.

  Args:
    data: DataFrame holding, for every question listed below, one column
      suffixed '_x' (first user) and one suffixed '_y' (second user).

  Returns:
    The output of ColumnTransformer.fit_transform on `data`: the one-hot
    encoding of the selected columns only. Columns not selected are dropped,
    because ColumnTransformer's default is remainder='drop'.

  Raises:
    ValueError: if one of the expected suffixed columns is missing.
  """
  survey_questions = [
      'When is your preferred time to hang out with friends? ',
      'What is your preferred way of spending time with friends?',
      'How often do you like to spend time with your friends?',
      'How many people do you like to spend time with at once?',
      'What is your top love language?',
      'How outdoorsy are you?',
      'Introvert or extrovert?',
  ]
  column_names = list(data.columns)
  # Positions of each question's _x column followed by its _y column.
  positions = [
      column_names.index(question + suffix)
      for question in survey_questions
      for suffix in ('_x', '_y')
  ]
  encoder = ColumnTransformer([('categorical', OneHotEncoder(), positions)])
  return encoder.fit_transform(data)

In [None]:
# One-hot encode the selected survey answers; X becomes a sparse matrix.
# NOTE(review): this rebinds X, so re-running this cell without first
# re-running the cell that builds X fails (preprocess reads data.columns,
# which the encoded matrix does not have).
X = preprocess(X)
X

<2772x71 sparse matrix of type '<class 'numpy.float64'>'
	with 38808 stored elements in Compressed Sparse Row format>

# Output Data

In [None]:
# Show the activity names. Note the trailing space in 'Arts and crafts '.
activities_list

['Hiking',
 'Journaling',
 'Reading nonfiction',
 'Drawing',
 'Hanging out with friends. (pre-COVID)',
 'Social distance exercise (walk, run, etc.)',
 'Netflix party',
 'Video chat hangout',
 'Wine tasting/Cocktail shake up',
 'Trivia contests',
 'Virtual escape room',
 'Arts and crafts ']

In [None]:
# Hand-written 8-flag binary description of every surveyed activity. Keys are
# the upper-cased, whitespace-stripped activity names.
# NOTE(review): the meaning of the eight positions is not documented anywhere
# in the visible notebook - add a legend.
activity_to_features = {
  'HIKING':[1, 1, 1, 0, 0, 1, 1, 0],
  'JOURNALING':[0, 0, 1, 1, 1, 0, 0, 0],
  'READING NONFICTION':[0, 0, 1, 1, 0, 0, 0, 0],
  'DRAWING':[0, 0, 1, 0, 1, 0, 0, 0],
  'HANGING OUT WITH FRIENDS. (PRE-COVID)':[1, 1, 1, 0, 0, 1, 1, 1],
  'SOCIAL DISTANCE EXERCISE (WALK, RUN, ETC.)':[1, 1, 1, 0, 0, 1, 1, 0],
  'NETFLIX PARTY':[0, 0, 0, 0, 1, 1, 0, 0],
  'VIDEO CHAT HANGOUT':[0, 0, 1, 0, 0, 1, 1, 0],
  'WINE TASTING/COCKTAIL SHAKE UP':[0, 0, 1, 0, 1, 1, 1, 1],
  'TRIVIA CONTESTS':[0, 0, 0, 0, 1, 1, 1, 0],
  'VIRTUAL ESCAPE ROOM':[0, 0, 1, 0, 1, 1, 1, 0],
  'ARTS AND CRAFTS':[0, 0, 1, 0, 1, 1, 0, 0]
}

# Same 8-flag encoding for activities that were not part of the survey.
# NOTE(review): not referenced by any other cell visible here; presumably meant
# for mapping predicted feature vectors back to suggestions - confirm.
suggestions_from_features = {
  'PICNIC':[1, 0, 1, 0, 1, 1, 1, 1],
  'GROUP GAMES (AMONG US, CODE NAMES, ETC.)':[0, 0, 0, 0, 1, 1, 1, 0],
  'GRABBING FOOD OR DRINKS TOGETHER':[1, 0, 0, 0, 0, 0, 1, 1],
  'STUDY TOGETHER':[0, 0, 0, 1, 0, 1, 0, 0],
  'VIDEO GAMES':[0, 0, 0, 0, 1, 1, 1, 0],
  'COOKING/BAKING CLASS':[0, 0, 1, 0, 1, 1, 1, 1],
  'PAINTING SOCIAL':[1, 0, 0, 0, 1, 1, 1, 0],
  'BOOK CLUB':[0, 0, 0, 1, 1, 1, 1, 0],
  'KARAOKE':[0, 0, 0, 0, 1, 1, 1, 0],
  'COOKING/BAKING COMPETITION':[0, 0, 1, 0, 1, 1, 1, 1],
  'WORKOUT SESSION':[1, 1, 1, 0, 0, 1, 0, 0],
  'SELF-CARE SHEET MASK + TEA SESSION':[0, 0, 1, 0, 1, 0, 1, 1],
  'ONLINE SHOPPING SESSION':[0, 0, 0, 0, 1, 1, 0, 0]
}


In [None]:
# Targets: one 8-flag feature vector per row, looked up from the row's
# activity name after normalising it to the dictionary's key format.
activity_keys = [name.strip().upper() for name in full_relationship_data['Activity_Name']]
target_rows = [activity_to_features[key] for key in activity_keys]
y = pd.DataFrame(np.array(target_rows))
y

Unnamed: 0,0,1,2,3,4,5,6,7
0,1,1,1,0,0,1,1,0
1,0,0,1,1,1,0,0,0
2,0,0,1,1,0,0,0,0
3,0,0,1,0,1,0,0,0
4,1,1,1,0,0,1,1,1
...,...,...,...,...,...,...,...,...
2767,0,0,1,0,0,1,1,0
2768,0,0,1,0,1,1,1,1
2769,0,0,0,0,1,1,1,0
2770,0,0,1,0,1,1,1,0


# Train-Val-Test Split

In [None]:
# Random seed shared by the data splits and the decision tree.
seed = 135

In [None]:
# Hold out 20% of the rows for testing, then take 25% of the remaining 80% for
# validation, giving a 60/20/20 train/validation/test split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=seed)

# Report the shape of every split.
for split_label, split_X, split_y in (
    ("Train:      ", X_train, y_train),
    ("Validation: ", X_val, y_val),
    ("Test:       ", X_test, y_test),
):
  print(split_label, split_X.shape, split_y.shape)

Train:       (1662, 71) (1662, 8)
Validation:  (555, 71) (555, 8)
Test:        (555, 71) (555, 8)


# ML Model
1. Support Vector Regression
2. Decision Tree Regression

In [None]:
# Iteration cap passed to the SVR solver.
max_iter = 10000

In [None]:
# SVR predicts a single target, so MultiOutputRegressor fits one RBF-kernel SVR
# per target column of y_train.
svr_reg = MultiOutputRegressor(SVR(kernel='rbf', max_iter=max_iter))
svr_reg.fit(X_train, y_train)

MultiOutputRegressor(estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3,
                                   epsilon=0.1, gamma='scale', kernel='rbf',
                                   max_iter=10000, shrinking=True, tol=0.001,
                                   verbose=False),
                     n_jobs=None)

In [None]:
# R^2 on the test split; a negative value means the fit is worse than always
# predicting the target mean.
# NOTE(review): X_val / y_val are never used in the visible cells - the test
# split is doing the validation split's job here.
svr_reg.score(X_test, y_test)

-0.21058945460871092

In [None]:
# A single decision tree fitted to all target columns at once; seeded so the
# fit is reproducible.
dt_reg = DecisionTreeRegressor(random_state = seed)  
dt_reg.fit(X_train, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=135, splitter='best')

In [None]:
# R^2 of the decision tree on the test split (negative: worse than predicting
# the target mean).
dt_reg.score(X_test, y_test)



-0.2629815132937462

As a result of the poor score, we pivoted from this to a different model.