## Assignment 2: Well-being from Instagram data


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [5]:
# Read the individual data frames in a single pass. The paths are listed in the
# same order as the names they are bound to.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
(
    anp_df,
    face_df,
    image_df,
    metrics_df,
    object_labels_df,
    survey_df,
) = map(
    pd.read_pickle,
    [
        r'Data/anp.pickle',
        r'Data/face.pickle',
        r'Data/image_data.pickle',
        r'Data/image_metrics.pickle',
        r'Data/object_labels.pickle',
        r'Data/survey.pickle',
    ],
)

In [9]:
# The survey stores the Instagram user id under 'insta_user_id'; make it numeric
# up front so it can be matched against the numeric 'user_id' further down.
survey_df['insta_user_id'] = pd.to_numeric(survey_df['insta_user_id'])

# Join the per-image tables one at a time on image_id. Inner joins keep only the
# images that appear in every table.
image_anp_frame = image_df.merge(anp_df, how='inner', on='image_id')
im_anp_obj_frame = image_anp_frame.merge(object_labels_df, how='inner', on='image_id')
im_anp_obj_face_frame = im_anp_obj_frame.merge(face_df, how='inner', on='image_id')
im_anp_obj_face_metrics_frame = im_anp_obj_face_frame.merge(
    metrics_df, how='inner', on='image_id'
)
im_anp_obj_face_metrics_frame['user_id'] = pd.to_numeric(
    im_anp_obj_face_metrics_frame['user_id']
)

# Attach the survey answers of the posting user. The left join keeps image rows
# even when no survey row matches.
total_df = im_anp_obj_face_metrics_frame.merge(
    survey_df, how='left', left_on='user_id', right_on='insta_user_id'
)

In [14]:
# Preview the first rows of the merged frame to sanity-check the joins.
total_df.head()

Unnamed: 0,image_id,image_link,image_url,image_height,image_width,image_filter,image_posted_time_unix,image_posted_time,data_memorability,user_id,...,P,E,R,M,A,PERMA,N_EMO,P_EMO,imagecount,private_account
0,552382455733335946_263042348,https://www.instagram.com/p/eqdOq2JLeK/,https://scontent.cdninstagram.com/t51.2885-15/...,612.0,612.0,Normal,1380069141,25-09-2013 00:32:21,0.875568,263042348,...,5.666667,6.0,6.0,6.666667,7.0,,3.0,5.666667,73.0,public
1,552382455733335946_263042348,https://www.instagram.com/p/eqdOq2JLeK/,https://scontent.cdninstagram.com/t51.2885-15/...,612.0,612.0,Normal,1380069141,25-09-2013 00:32:21,0.875568,263042348,...,5.666667,6.0,6.0,6.666667,7.0,,3.0,5.666667,73.0,public
2,552382455733335946_263042348,https://www.instagram.com/p/eqdOq2JLeK/,https://scontent.cdninstagram.com/t51.2885-15/...,612.0,612.0,Normal,1380069141,25-09-2013 00:32:21,0.875568,263042348,...,5.666667,6.0,6.0,6.666667,7.0,,3.0,5.666667,73.0,public
3,552382455733335946_263042348,https://www.instagram.com/p/eqdOq2JLeK/,https://scontent.cdninstagram.com/t51.2885-15/...,612.0,612.0,Normal,1380069141,25-09-2013 00:32:21,0.875568,263042348,...,5.666667,6.0,6.0,6.666667,7.0,,3.0,5.666667,73.0,public
4,552382455733335946_263042348,https://www.instagram.com/p/eqdOq2JLeK/,https://scontent.cdninstagram.com/t51.2885-15/...,612.0,612.0,Normal,1380069141,25-09-2013 00:32:21,0.875568,263042348,...,5.666667,6.0,6.0,6.666667,7.0,,3.0,5.666667,73.0,public


In [40]:
# List every column available after the merge, as input for feature selection.
total_df.columns

Index(['image_id', 'image_link', 'image_url', 'image_height', 'image_width',
       'image_filter', 'image_posted_time_unix', 'image_posted_time',
       'data_memorability', 'user_id', 'user_full_name', 'user_name',
       'user_website', 'user_profile_pic', 'user_bio', 'user_followed_by',
       'user_follows', 'user_posted_photos', 'anp_label', 'anp_sentiment',
       'emotion_score', 'emotion_label', 'data_amz_label',
       'data_amz_label_confidence', 'face_id', 'face_gender',
       'face_gender_confidence', 'face_age_range_high', 'face_age_range_low',
       'face_sunglasses', 'face_beard', 'face_beard_confidence',
       'face_mustache', 'face_mustache_confidence', 'face_smile',
       'face_smile_confidence', 'eyeglasses', 'eyeglasses_confidence',
       'face_emo', 'emo_confidence', 'comment_count',
       'comment_count_time_created', 'like_count', 'like_count_time_created',
       'index', 'id', 'gender', 'born', 'education', 'employed', 'income',
       'A_2', 'N_1', 'P_1

In [73]:
# Inspect which distinct values private_account takes in the merged frame.
total_df['private_account'].unique()

array(['public'], dtype=object)

## Training linear model

Steps to follow:
- Data Cleaning
- Selecting relevant features
- Feature engineering (making features based on other features)

Interesting features:
- Orientation (image_height/image_width)
- Image filter (one-hot encoding or classification according to media)
- Data memorability
- User_bio
- User_follows, User_followed_by
- Popularity (user_followed_by / user_follows)
- user_posted_photos
- anp_sentiment, anp_label (check whether treating them as ordinal is justified?)
- emotion_score, emotion_label (check whether treating them as ordinal is justified?)
- data_amz_label, data_amz_label_confidence (keep everything above x% confidence, then group into categories)
- face_gender (weighted by face_gender_confidence)
- face_age_mean ((face_age_range_high + face_age_range_low) / 2)
- face_sunglasses, face_beard, face_mustache, ...., emo_confidence
- comment_count
- like_count
- gender 
- born
- education
- employed
- income (needs cleaning)
- HAP
- participate
- end_q - start_q 
- imagecount
- private_account is not useful, because it only has one value ('public')

Options:
- Faces only (fewer samples, but more parameters), selected based on data_amz_label
- All images (more samples, but not all parameters)
- Validate the PERMA score







In [152]:
# Every feature helper takes total_df and new_df as input and returns new_df
def duration_questionnaire(new_df, total_df):
    """Add the questionnaire completion time in seconds as column ``dur_quest``.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Feature frame being built. The column is added to this object, which
        is also returned.
    total_df : pandas.DataFrame
        Frame providing the ``start_q`` and ``end_q`` columns; their values
        must be parseable by ``pd.to_datetime``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``new_df`` with ``dur_quest`` = ``end_q - start_q`` in seconds.
    """
    # Parse into locals instead of overwriting the caller's columns, so that
    # running this helper has no side effect on total_df.
    finished = pd.to_datetime(total_df['end_q'])
    started = pd.to_datetime(total_df['start_q'])
    new_df['dur_quest'] = (finished - started).dt.total_seconds()

    return new_df

def one_hot_encode(new_df, total_df, column='image_filter'):
    """Append one indicator column per distinct value of ``total_df[column]``.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Feature frame to extend. It is not modified; a new frame is returned.
    total_df : pandas.DataFrame
        Frame holding the categorical column to encode.
    column : str, default 'image_filter'
        Name of the column in ``total_df`` to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        ``new_df`` with the dummy columns concatenated on the right. The new
        columns are named after the category values themselves.
    """
    # Honour the `column` argument; it used to be ignored in favour of a
    # hard-coded 'image_filter'.
    dummies = pd.get_dummies(total_df[column])
    return pd.concat([new_df, dummies], axis=1)

def born_to_age(total_df, newdf, reference_year=2019):
    """Derive an ``age`` column from the birth year in ``total_df['born']``.

    Note the argument order: ``total_df`` comes first here, whereas
    ``duration_questionnaire`` and ``one_hot_encode`` take the feature frame
    first.

    Parameters
    ----------
    total_df : pandas.DataFrame
        Frame providing the numeric ``born`` column (year of birth).
    newdf : pandas.DataFrame
        Feature frame. The ``age`` column is added to this object, which is
        also returned.
    reference_year : int, default 2019
        Year against which the age is computed.

    Returns
    -------
    pandas.DataFrame
        ``newdf`` with ``age`` = ``reference_year - born``.
    """
    newdf['age'] = reference_year - total_df['born']
    return newdf


array([26, 39, 41, 27, 22, 56, 25, 23, 45, 42, 32, 31, 40, 34, 33, 24, 28,
       44, 35, 38, 49, 29, 37, 47, 30, 36, 59, 43, 50, 48, 55, 60, 61, 21],
      dtype=int64)

In [189]:
# Build the feature frame from scratch. Each helper receives the frame produced
# by the previous step, so the cell also works on a fresh kernel (the earlier
# version passed `new_df`, a name this cell never defines).
newdf = pd.DataFrame()
newdf = duration_questionnaire(newdf, total_df)
newdf = one_hot_encode(newdf, total_df)

# total_df['end_q']

In [146]:
def linear_regression(total_df, y, random_state=None):
    """Fit a linear regression on a random train/test split and print its fit.

    Rows containing a missing value in any feature or in the target are
    dropped before splitting.

    Parameters
    ----------
    total_df : pandas.DataFrame
        Feature frame; every column is used as a predictor. It is not
        modified. A pre-existing column named ``y`` would be replaced by the
        target in the working copy.
    y : pandas.Series
        Target values, aligned to ``total_df`` by index.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. Pass an integer to make the
        split, and therefore the printed score, reproducible.

    Returns
    -------
    None
        The test-set R^2 and the fitted coefficients are printed.
    """
    # assign() returns a new frame, so the caller's feature frame does not end
    # up carrying the target as an extra 'y' column.
    complete = total_df.assign(y=y).dropna(axis=0)
    target = complete['y']
    features = complete.drop(columns=['y'])
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, random_state=random_state
    )
    lr = LinearRegression().fit(X_train, y_train)
    pred_y = lr.predict(X_test)
    print('Variance score: %.2f' % r2_score(y_test, pred_y))
    print('Coefficients:\n', lr.coef_)
    return

# Regress the PERMA column on the engineered features collected in newdf.
linear_regression(newdf, total_df['PERMA'])

       dur_quest  1977  Aden  Amaro  Ashby  Brannan  Brooklyn  Charmes  \
25980       85.0     0     0      0      0        0         0        0   
25981       85.0     0     0      0      0        0         0        0   
25982       85.0     0     0      0      0        0         0        0   
25983       85.0     0     0      0      0        0         0        0   
25984       85.0     0     0      0      0        0         0        0   

       Clarendon  Crema  ...  Slumber  Stinson  Sutro  Toaster  Unknown  \
25980          0      0  ...        0        0      0        0        0   
25981          0      0  ...        0        0      0        0        0   
25982          0      0  ...        0        0      0        0        0   
25983          0      0  ...        0        0      0        0        0   
25984          0      0  ...        0        0      0        0        0   

       Valencia  Vesper  Walden  Willow  X-Pro II  
25980         0       0       0       0         0  


In [156]:
def create_correlation_matrix(newdf):
    """Placeholder: not implemented yet. Ignores ``newdf`` and returns None."""
    # TODO: implement; the body is currently an empty stub.
    return None

In [196]:
# Grab one raw income value (row position 7), presumably for trying out
# income_from_string by hand; `test` is not referenced in the other visible cells.
test = total_df.iloc[7]['income']

def income_from_string(string, missing=999999):
    """Turn a free-text income answer into a single number.

    Every character except digits and ``$`` is discarded, the remainder is
    split on ``$``, and the resulting amounts are averaged. For example,
    ``'$10,000 to $19,999'`` becomes ``14999.5``.

    Parameters
    ----------
    string : str
        Raw income answer. Non-string input (e.g. ``None`` or NaN for a
        respondent without survey data) is treated as missing.
    missing : number, default 999999
        Value returned when no amount can be extracted. The default is a
        sentinel, not a real income; pass ``float('nan')`` if downstream code
        should see it as missing instead.

    Returns
    -------
    float
        Mean of the amounts found, or ``missing`` if there are none.
    """
    if not isinstance(string, str):
        return missing
    kept = ''.join(c for c in string if c.isdigit() or c == '$')
    values = [int(part) for part in kept.split('$') if part]
    # Explicit emptiness check instead of a bare `except:` around the division.
    if not values:
        return missing
    return sum(values) / len(values)

def income_transform(newdf, total_df):
    """Store the numeric form of ``total_df['income']`` in ``newdf['income']``.

    Each raw value is converted with ``income_from_string``. ``newdf`` is
    modified in place and returned.
    """
    parsed_income = total_df['income'].apply(income_from_string)
    newdf['income'] = parsed_income
    return newdf

In [200]:
# Add the parsed income feature, then list its distinct values as a sanity check.
# 999999 is the value income_from_string returns by default when it finds no amount.
newdf = income_transform(newdf, total_df)
list(newdf['income'].unique())

[24999.5,
 34999.5,
 94999.5,
 44999.5,
 14999.5,
 150000.0,
 54999.5,
 84999.5,
 10000.0,
 124999.5,
 74999.5,
 999999.0,
 64999.5]