## Assignment 2: Well-being from Instagram data


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline


import pickle

In [2]:
# Load the six source tables. The unpacking order mirrors the file list below.
# NOTE: unpickling executes arbitrary code, so these files must come from a trusted source.
(anp_df, face_df, image_df,
 metrics_df, object_labels_df, survey_df) = (
    pd.read_pickle(pickle_name)
    for pickle_name in (
        r'anp.pickle',
        r'face.pickle',
        r'image_data.pickle',
        r'image_metrics.pickle',
        r'object_labels.pickle',
        r'survey.pickle',
    )
)

In [3]:
# Row count of each table, one per line, in the order:
# face, anp, image, metrics, object labels, survey.
print(
    *(len(table) for table in (face_df, anp_df, image_df,
                               metrics_df, object_labels_df, survey_df)),
    sep='\n',
)

86877
325941
41206
44218
172613
161


In [4]:
# Number of columns in the face table (second entry of the frame's shape).
face_df.shape[1]

17

In [5]:
# List the column names of the face table.
face_df.columns

Index(['image_id', 'face_id', 'face_gender', 'face_gender_confidence',
       'face_age_range_high', 'face_age_range_low', 'face_sunglasses',
       'face_beard', 'face_beard_confidence', 'face_mustache',
       'face_mustache_confidence', 'face_smile', 'face_smile_confidence',
       'eyeglasses', 'eyeglasses_confidence', 'face_emo', 'emo_confidence'],
      dtype='object')

In [6]:
# Chain inner joins on image_id so every row carries image, ANP, object-label,
# face and metrics fields. The per-image tables are not de-duplicated here, so
# an image with several ANP/object/face rows yields one row per combination.
image_anp_frame = image_df.merge(anp_df, how='inner', on='image_id')
im_anp_obj_frame = image_anp_frame.merge(object_labels_df, how='inner', on='image_id')
im_anp_obj_face_frame = im_anp_obj_frame.merge(face_df, how='inner', on='image_id')
im_anp_obj_face_metrics_frame = im_anp_obj_face_frame.merge(metrics_df, how='inner', on='image_id')

# Both join keys are passed through pd.to_numeric before the survey join.
survey_df['insta_user_id'] = pd.to_numeric(survey_df['insta_user_id'])
im_anp_obj_face_metrics_frame['user_id'] = pd.to_numeric(im_anp_obj_face_metrics_frame['user_id'])

# Left join: image rows are kept even when no survey row matches the user.
total_df = im_anp_obj_face_metrics_frame.merge(
    survey_df, how='left', left_on='user_id', right_on='insta_user_id'
)

In [7]:
# Preview the first rows of the merged frame.
total_df.head()

Unnamed: 0,image_id,image_link,image_url,image_height,image_width,image_filter,image_posted_time_unix,image_posted_time,data_memorability,user_id,...,P,E,R,M,A,PERMA,N_EMO,P_EMO,imagecount,private_account
0,552382455733335946_263042348,https://www.instagram.com/p/eqdOq2JLeK/,https://scontent.cdninstagram.com/t51.2885-15/...,612.0,612.0,Normal,1380069141,25-09-2013 00:32:21,0.875568,263042348,...,5.666667,6.0,6.0,6.666667,7.0,,3.0,5.666667,73.0,public
1,552382455733335946_263042348,https://www.instagram.com/p/eqdOq2JLeK/,https://scontent.cdninstagram.com/t51.2885-15/...,612.0,612.0,Normal,1380069141,25-09-2013 00:32:21,0.875568,263042348,...,5.666667,6.0,6.0,6.666667,7.0,,3.0,5.666667,73.0,public
2,552382455733335946_263042348,https://www.instagram.com/p/eqdOq2JLeK/,https://scontent.cdninstagram.com/t51.2885-15/...,612.0,612.0,Normal,1380069141,25-09-2013 00:32:21,0.875568,263042348,...,5.666667,6.0,6.0,6.666667,7.0,,3.0,5.666667,73.0,public
3,552382455733335946_263042348,https://www.instagram.com/p/eqdOq2JLeK/,https://scontent.cdninstagram.com/t51.2885-15/...,612.0,612.0,Normal,1380069141,25-09-2013 00:32:21,0.875568,263042348,...,5.666667,6.0,6.0,6.666667,7.0,,3.0,5.666667,73.0,public
4,552382455733335946_263042348,https://www.instagram.com/p/eqdOq2JLeK/,https://scontent.cdninstagram.com/t51.2885-15/...,612.0,612.0,Normal,1380069141,25-09-2013 00:32:21,0.875568,263042348,...,5.666667,6.0,6.0,6.666667,7.0,,3.0,5.666667,73.0,public


In [8]:
def addRatioValues(newdf, olddf, a, b, c):
    """Store the element-wise ratio ``olddf[a] / olddf[b]`` in ``newdf[c]``.

    Parameters
    ----------
    newdf : pandas.DataFrame
        Frame that receives the new column (modified in place and returned).
    olddf : pandas.DataFrame
        Frame holding the numerator column ``a`` and denominator column ``b``.
    a, b : str
        Names of the numerator and denominator columns in ``olddf``.
    c : str
        Name of the ratio column written to ``newdf``.

    Returns
    -------
    pandas.DataFrame
        ``newdf`` with column ``c`` added.
    """
    ratio = olddf[a] / olddf[b]
    # A zero denominator yields +/-inf, which dropna() does not remove and
    # which scikit-learn estimators reject; mark those rows as missing instead.
    newdf[c] = ratio.replace([np.inf, -np.inf], np.nan)
    return newdf

def copyColumnValues(newdf, olddf, a):
    """Copy column ``a`` from ``olddf`` into ``newdf`` under the same name.

    ``newdf`` is modified in place and also returned.
    """
    source_column = olddf[a]
    newdf[a] = source_column
    return newdf

def addMeanValues(newdf, olddf, a, b, c):
    """Write the element-wise mean of columns ``a`` and ``b`` to ``newdf[c]``.

    Computes ``c = (a + b) / 2`` from ``olddf``; ``newdf`` is modified in place
    and returned.
    """
    column_sum = olddf[a].add(olddf[b])
    newdf[c] = column_sum.div(2)
    return newdf

def income_from_string(string, missing=999999):
    """Turn an income answer into the average of the dollar amounts it contains.

    Only digits and ``$`` signs are kept, the remainder is split on ``$`` and
    the resulting integers are averaged, e.g. ``'$25,000 to $49,999'`` gives
    ``37499.5``.

    Parameters
    ----------
    string : str
        Raw income answer. Non-string values (``None``, NaN) are treated as
        missing.
    missing : number, optional
        Value returned when no amount can be extracted. Defaults to the
        sentinel ``999999`` that this function has always used.

    Returns
    -------
    float or int
        Mean of the extracted amounts, or ``missing``.
    """
    # A missing answer arrives as NaN/None rather than text; iterating over it
    # would raise TypeError.
    if not isinstance(string, str):
        return missing
    kept = ''.join(ch for ch in string if ch.isdigit() or ch == '$')
    amounts = [int(part) for part in kept.split('$') if part]
    # Explicit empty check instead of a bare ``except`` around the division.
    if not amounts:
        return missing
    return sum(amounts) / len(amounts)

def income_transform(newdf, total_df):
    """Add an 'income' column by applying income_from_string to each raw value.

    Reads ``total_df['income']``; ``newdf`` is modified in place and returned.
    """
    numeric_income = total_df['income'].apply(income_from_string)
    newdf['income'] = numeric_income
    return newdf

def one_hot_encode(newdf, totaldf, column, drop_first=False, prefix=None):
    """Append one-hot (dummy) columns for ``totaldf[column]`` to ``newdf``.

    Parameters
    ----------
    newdf : pandas.DataFrame
        Frame the dummy columns are appended to (a new frame is returned).
    totaldf : pandas.DataFrame
        Frame holding the categorical column.
    column : str
        Name of the categorical column to encode.
    drop_first : bool, optional
        Passed to ``pandas.get_dummies``; drops the first category level.
    prefix : str, optional
        Passed to ``pandas.get_dummies``. With the default ``None`` the dummy
        columns are named after the category values alone, so two encoded
        columns that share a level name (e.g. both containing 'Male') produce
        duplicate column names. Pass a prefix to keep them distinct.

    Returns
    -------
    pandas.DataFrame
        ``newdf`` concatenated column-wise with the dummy columns.
    """
    dummies = pd.get_dummies(totaldf[column], prefix=prefix, drop_first=drop_first)
    return pd.concat([newdf, dummies], axis=1)

def duration_questionnaire(newdf, totaldf):
    """Add the questionnaire duration, in seconds, to ``newdf`` as 'dur_quest'.

    The duration is ``end_q - start_q``, both parsed with ``pd.to_datetime``.
    It is stored as a float number of seconds (not a Timedelta) so the column
    can be used as a numeric feature, matching the later redefinition of this
    function in the notebook.

    Parameters
    ----------
    newdf : pandas.DataFrame
        Frame that receives the 'dur_quest' column (modified in place).
    totaldf : pandas.DataFrame
        Frame holding the 'start_q' and 'end_q' columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``newdf`` with 'dur_quest' added.
    """
    # Parse into locals so the caller's frame keeps its original columns.
    end_time = pd.to_datetime(totaldf['end_q'])
    start_time = pd.to_datetime(totaldf['start_q'])
    newdf['dur_quest'] = (end_time - start_time).dt.total_seconds()
    return newdf

In [82]:
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

def duration_questionnaire(newdf, total_df):
    """Add the questionnaire duration, in seconds, to ``newdf`` as 'dur_quest'.

    The duration is ``end_q - start_q``, both parsed with ``pd.to_datetime``
    and converted to a float number of seconds.

    Parameters
    ----------
    newdf : pandas.DataFrame
        Frame that receives the 'dur_quest' column (modified in place).
    total_df : pandas.DataFrame
        Frame holding the 'start_q' and 'end_q' columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``newdf`` with 'dur_quest' added.
    """
    # Parse into locals so the caller's frame keeps its original columns.
    end_time = pd.to_datetime(total_df['end_q'])
    start_time = pd.to_datetime(total_df['start_q'])
    newdf['dur_quest'] = (end_time - start_time).dt.total_seconds()
    return newdf

def born_to_age(newdf, total_df, current_year=2019):
    """Add an 'age' column computed as ``current_year - born``.

    Parameters
    ----------
    newdf : pandas.DataFrame
        Frame that receives the 'age' column (modified in place).
    total_df : pandas.DataFrame
        Frame holding the 'born' (birth year) column.
    current_year : int, optional
        Reference year for the age calculation. Defaults to 2019, the value
        that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        ``newdf`` with 'age' added.
    """
    newdf['age'] = current_year - total_df['born']
    return newdf

def linear_regression(total_df, y_under, n_train=2080, min_group_size=40):
    """Undersample rows by group, then fit a Ridge model predicting 'PERMA'.

    Steps: attach the group label, drop rows with missing values, discard
    groups with fewer than ``min_group_size`` rows, balance the groups with
    ``RandomUnderSampler(random_state=0)``, split the resampled rows
    sequentially into train/test, scale with ``RobustScaler`` and fit ``Ridge``.

    Parameters
    ----------
    total_df : pandas.DataFrame
        Feature frame that includes the target column 'PERMA'. Not modified.
    y_under : pandas.Series
        Group label per row (``main`` passes the user id), aligned with
        ``total_df`` by index. Used as the class label for undersampling.
    n_train : int, optional
        Number of leading resampled rows used for training; the remainder is
        the test set. Defaults to the previously hard-coded 2080.
    min_group_size : int, optional
        Minimum number of rows a group needs to be kept. Defaults to the
        previously hard-coded 40.

    Returns
    -------
    tuple
        ``(coefficients, r2_test, r2_train)``.

    Raises
    ------
    ValueError
        If the resampled data has ``n_train`` rows or fewer, leaving no test
        rows.
    """
    # assign() works on a copy, so the caller's frame does not gain a
    # 'y_under' column as a side effect.
    data = total_df.assign(y_under=y_under).dropna(axis=0)
    print(len(data['y_under'].value_counts()))
    data = data.groupby("y_under").filter(lambda group: len(group) >= min_group_size)
    print(len(data['y_under'].value_counts()))

    features = data.drop(columns=['y_under'])
    print(features.shape)
    feature_names = features.columns

    sampler = RandomUnderSampler(random_state=0)
    X_res, groups_res = sampler.fit_resample(features, data['y_under'])
    # Depending on the imbalanced-learn version the result is an ndarray or a
    # DataFrame; only rebuild the frame when the column labels were lost.
    if not isinstance(X_res, pd.DataFrame):
        X_res = pd.DataFrame(X_res, columns=feature_names)
    resampled = X_res
    # Plain array avoids index alignment when attaching the labels.
    resampled['y_under'] = np.asarray(groups_res)
    print(resampled.shape)

    y = resampled['PERMA']
    # NOTE(review): 'y_under' stays among the predictors here, exactly as
    # before; confirm that the group label is meant to be a feature.
    X = resampled.drop(columns=['PERMA'])

    if len(resampled) <= n_train:
        raise ValueError(
            'Need more than n_train=%d resampled rows to form a test set, got %d'
            % (n_train, len(resampled))
        )

    # NOTE(review): the split is sequential (no shuffling) on the sampler's
    # output order, as in the original; confirm this is intended.
    X_train_raw, X_test_raw = X.iloc[:n_train], X.iloc[n_train:]
    y_train, y_test = y.iloc[:n_train], y.iloc[n_train:]

    # Fit the scaler on the training rows only so that test-set statistics do
    # not leak into the model (previously it was fitted on all rows).
    scaler = preprocessing.RobustScaler().fit(X_train_raw)
    X_train = scaler.transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    lr = Ridge().fit(X_train, y_train)
    r2_test = r2_score(y_test, lr.predict(X_test))
    r2_train = r2_score(y_train, lr.predict(X_train))

    # The two scores used to be printed with the same label; tell them apart.
    print('Variance score (test): %.2f' % r2_test)
    print('Variance score (train): %.2f' % r2_train)
    print('Coefficients:\n', lr.coef_)
    return lr.coef_, r2_test, r2_train

In [83]:
def getCompleteDF():
    """Load all pickled tables and merge them into one frame.

    The ANP, face, metrics and object-label tables are reduced to a single row
    per image (the row with the highest value of a ranking column) before
    being inner-joined to the image table on 'image_id'. The result is then
    inner-joined to the survey on the numeric user id.
    """
    def top_row_per_image(pickle_name, rank_column):
        # Sort descending on the ranking column, keep the first row per image.
        ranked = pd.read_pickle(pickle_name).sort_values(rank_column, ascending=False)
        return ranked.drop_duplicates(['image_id'])

    anp_df = top_row_per_image(r'anp.pickle', 'emotion_score')
    face_df = top_row_per_image(r'face.pickle', 'emo_confidence')
    image_df = pd.read_pickle(r'image_data.pickle')
    metrics_df = top_row_per_image(r'image_metrics.pickle', 'like_count')
    object_labels_df = top_row_per_image(r'object_labels.pickle', 'data_amz_label_confidence')
    survey_df = pd.read_pickle(r'survey.pickle')

    # Join order: image -> ANP -> object labels -> face -> metrics.
    merged = image_df
    for per_image_df in (anp_df, object_labels_df, face_df, metrics_df):
        merged = pd.merge(merged, per_image_df, how='inner', on='image_id')

    # Both join keys are passed through pd.to_numeric before the survey join.
    survey_df['insta_user_id'] = pd.to_numeric(survey_df['insta_user_id'])
    merged['user_id'] = pd.to_numeric(merged['user_id'])

    return pd.merge(merged, survey_df, how='inner',
                    left_on='user_id', right_on='insta_user_id')

In [84]:
def getUsefulColumnsDF(total_df):
    """Build the modelling frame from the merged data.

    The result holds, in this order: derived ratio/mean/duration features,
    one-hot encoded categorical columns, the numeric income, a set of columns
    copied unchanged (including the target 'PERMA'), and the age.

    Parameters
    ----------
    total_df : pandas.DataFrame
        Merged frame as returned by getCompleteDF.

    Returns
    -------
    pandas.DataFrame
        Frame containing only the selected and derived columns.
    """
    newdf = pd.DataFrame()

    # Derived numeric features.
    newdf = addRatioValues(newdf, total_df, 'image_height', 'image_width', 'image_ratio')
    newdf = addRatioValues(newdf, total_df, 'user_followed_by', 'user_follows', 'popularity')
    newdf = addMeanValues(newdf, total_df, 'face_age_range_high', 'face_age_range_low', 'face_age_mean')
    newdf = duration_questionnaire(newdf, total_df)

    # (column, drop_first) pairs, encoded in this order.
    # NOTE: 'face_gender' and 'gender' are both encoded without a prefix, so
    # the result contains two columns named 'Male'.
    categorical_columns = (
        ('image_filter', False),
        ('face_gender', True),
        ('education', False),
        ('employed', False),
        ('gender', True),
        ('participate', True),
    )
    for column, drop_first in categorical_columns:
        newdf = one_hot_encode(newdf, total_df, column=column, drop_first=drop_first)

    newdf = income_transform(newdf, total_df)

    # Columns copied as-is; 'HAP' is deliberately left out.
    copied_columns = (
        'data_memorability',
        'user_followed_by',
        'user_follows',
        'user_posted_photos',
        'comment_count',
        'like_count',
        'PERMA',
        'imagecount',
    )
    for column in copied_columns:
        newdf = copyColumnValues(newdf, total_df, column)

    # duration_questionnaire and income_transform used to be called a second
    # time here; that only overwrote the columns with identical values, so
    # the repeated calls were dropped.
    newdf = born_to_age(newdf, total_df)
    print(newdf.columns)
    return newdf

In [85]:
def main():
    """Build the merged data set, derive the features and fit the regression."""
    merged_df = getCompleteDF()
    feature_df = getUsefulColumnsDF(merged_df)
    # The user id is the group label used for undersampling.
    linear_regression(feature_df, merged_df['user_id'])


if __name__ == "__main__":
    main()

Index(['image_ratio', 'popularity', 'face_age_mean', 'dur_quest', '1977',
       'Aden', 'Amaro', 'Ashby', 'Brannan', 'Brooklyn', 'Charmes', 'Clarendon',
       'Crema', 'Dogpatch', 'Earlybird', 'Gingham', 'Ginza', 'Gotham', 'Hefe',
       'Helena', 'Hudson', 'Inkwell', 'Juno', 'Kelvin', 'Lark', 'Lo-fi',
       'Ludwig', 'Maven', 'Mayfair', 'Moon', 'Nashville', 'Normal', 'Perpetua',
       'Poprocket', 'Reyes', 'Rise', 'Sierra', 'Skyline', 'Slumber', 'Stinson',
       'Sutro', 'Toaster', 'Unknown', 'Valencia', 'Vesper', 'Walden', 'Willow',
       'X-Pro II', 'Male', 'College graduate', 'High school graduate',
       'Post graduate degree', 'Some high school', 'A homemaker', 'A student',
       'Employed for wages', 'Out of work and looking for work',
       'Out of work but not currently looking for work', 'Retired',
       'Self-employed', 'Unable to work', 'Male', 'Yes', 'income',
       'data_memorability', 'user_followed_by', 'user_follows',
       'user_posted_photos', 'comment_co

In [39]:
# Small 5x5 int32 array used for the indexing experiments below.
x = np.array(
    [
        [1, 2, 3, 4, 5],
        [4, 5, 6, 7, 8],
        [3, 4, 5, 6, 7],
        [3, 4, 9, 6, 7],
        [3, 8, 7, 6, 7],
    ],
    dtype=np.int32,
)

In [40]:
# Abandoned attempt to select frequently occurring users, kept for reference:
#total_df[total_df['user_id'].value_counts() > 300]

# Display the toy array.
x

array([[1, 2, 3, 4, 5],
       [4, 5, 6, 7, 8],
       [3, 4, 5, 6, 7],
       [3, 4, 9, 6, 7],
       [3, 8, 7, 6, 7]])

In [41]:
# Wrap the toy array in a frame with columns 'a' to 'e'.
dfx = pd.DataFrame(x, columns=['a', 'b', 'c', 'd', 'e'])

In [42]:
# Display the toy frame.
dfx

Unnamed: 0,a,b,c,d,e
0,1,2,3,4,5
1,4,5,6,7,8
2,3,4,5,6,7
3,3,4,9,6,7
4,3,8,7,6,7


In [43]:
# First two values of column 'b', selected by position.
dfx['b'].iloc[:2]

0    2
1    5
Name: b, dtype: int32

In [24]:
# Number of merged rows per user id, most frequent first.
total_df['user_id'].value_counts()

25469443      468138
703978203     251142
249861555     186711
287562303     183252
13745951      177900
31736205      162732
246095675     147732
372088523     143292
50853245      140217
30837828      133578
1619510       132468
452851338     123825
143763900     114666
288335200     111135
289794729      99885
6734387        99798
34069800       98859
276232195      97140
22180590       90132
1600397470     73863
143854846      72627
3069744        63990
235671446      63477
265063047      62574
48972978       60198
55281515       57939
33420910       55050
183823541      40908
52590715       31761
3988856        31371
               ...  
898090810       1020
49846561         960
526051197        930
3092060835       825
316760531        750
345836709        660
3417740025       645
2143580844       630
2486800367       615
2003630999       600
246535583        495
187539125        462
1950544520       420
192872688        360
4185084          345
571630184        270
4239188708   

## Training linear model

Steps to follow:
- Data Cleaning
- Selecting relevant features
- Feature engineering (making features based on other features)

Interesting features:
- Orientation (image_height/image_width)
- Image filter (one-hot encoding or classification according to media)
- Data memorability
- User_bio
- User_follows, User_followed_by
- Popularity (user_followed_by / user_follows)
- user_posted_photos
- anp_sentiment, label (check the reasoning for ordinality?)
- emotion_score, label (check the reasoning for ordinality?)
- data_amz_label, data_amz_confidence (everything above x% confidence, divided into categories)
- Face_gender (weighted by confidence)
- face_age_mean ((face_age_range_high + face_age_range_low) / 2)
- face_sunglasses, face_beard, face_mustache, ...., emo_confidence
- comment_count
- like_count
- gender 
- born
- education
- employed
- income (needs cleaning)
- HAP
- participate
- end_q - start_q 
- imagecount
- private_account is irrelevant, because it only has one value

Options:
- Faces (fewer samples, but more parameters), based on data_amz_label
- All (more samples, but not all parameters)
- Validate the PERMA score





