## Read Data

In [68]:
import numpy as np
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("once")
dtype=np.int64

In [69]:
anp = pd.read_pickle("data/anp.pickle")
celebrity = pd.read_pickle("data/celebrity.pickle")
face = pd.read_pickle("data/face.pickle")
image_data = pd.read_pickle("data/image_data.pickle")
image_metrics= pd.read_pickle("data/image_metrics.pickle")
object_labels = pd.read_pickle("data/object_labels.pickle")
survey = pd.read_pickle("data/survey.pickle")

In [70]:
##keep unique users (drop duplicates of survey) and correct values
survey.rename(columns={"insta_user_id": "user_id"}, inplace=True)
#remove most recent duplicates
survey = survey.drop(survey.index[113]) 
survey = survey.drop(survey.index[122])  

#correct scores
survey['PERMA'] = survey[['P_1','P_2','P_3','E_1','E_2','E_3','R_1','R_2','R_3','M_1','M_2','M_3','A_1','A_2','A_3']].mean(axis=1)
survey.rename(columns={'PERMA_corrected':'PERMA'}, inplace=True)

In [71]:
survey.to_pickle("data/FINAL/survey_final.pkl")

In [72]:
def printLengths(a):
    for i in a:
        print(len(i))
printLengths([anp, celebrity, face, image_data, image_metrics, object_labels, survey])

325941
2737
86877
41206
44218
172613
159


In [73]:
#Add user_id to image dataframes
survey['user_id'] = survey['user_id'].astype(int)
image_data['user_id'] = image_data['user_id'].astype(int)

def add_user_id(df):
    df['user_id'] = df['image_id'].map(lambda x: x.split('_')[1])
    df['user_id'] = df['user_id'].astype(int)

add_user_id(anp)
add_user_id(celebrity)
add_user_id(face)   
add_user_id(image_metrics)   
add_user_id(object_labels)

# Filter dataframes for non-survey related elements


In [74]:
def filter_by_common_feature(dfA, dfB, feature):
    filtered_df = dfA[dfA[feature].isin(dfB[feature].unique())]
    return filtered_df

filtered_image_data = filter_by_common_feature(image_data, survey,  'user_id')
filtered_image_metrics = filter_by_common_feature(image_metrics, filtered_image_data, 'user_id')
filtered_face = filter_by_common_feature(face, filtered_image_data, 'user_id')
filtered_object_labels = filter_by_common_feature(object_labels, filtered_image_data, 'user_id')
filtered_celebrity = filter_by_common_feature(celebrity, filtered_image_data, 'user_id')
filtered_anp = filter_by_common_feature(anp, filtered_image_data, 'user_id')

printLengths([filtered_anp, filtered_celebrity, filtered_face,
              filtered_image_data, filtered_image_metrics, filtered_object_labels])

filtered_image_data.head()

325369
2736
86784
41206
44131
172271


Unnamed: 0,image_id,image_link,image_url,image_height,image_width,image_filter,image_posted_time_unix,image_posted_time,data_memorability,user_id,user_full_name,user_name,user_website,user_profile_pic,user_bio,user_followed_by,user_follows,user_posted_photos
0,1316962883971761394_3468175004,https://www.instagram.com/p/BJGysPxgsTy/,https://scontent.cdninstagram.com/t51.2885-15/...,640.0,640.0,Lo-fi,1471214231,14-08-2016 22:37:11,0.800521,3468175004,Leah Jenkins,leah.chelle,,https://scontent.cdninstagram.com/t51.2885-19/...,,7.0,0.0,1.0
1,552382455733335946_263042348,https://www.instagram.com/p/eqdOq2JLeK/,https://scontent.cdninstagram.com/t51.2885-15/...,612.0,612.0,Normal,1380069141,25-09-2013 00:32:21,0.875568,263042348,Taylor Degruise,taylordegruise,,https://scontent.cdninstagram.com/t51.2885-19/...,,316.0,347.0,73.0
2,594552614686078174_263042348,https://www.instagram.com/p/hARnP2pLTe/,https://scontent.cdninstagram.com/t51.2885-15/...,640.0,640.0,Vesper,1385096216,22-11-2013 04:56:56,0.672679,263042348,Taylor Degruise,taylordegruise,,https://scontent.cdninstagram.com/t51.2885-19/...,,316.0,347.0,73.0
3,553884883234370621_263042348,https://www.instagram.com/p/evy13fpLQ9/,https://scontent.cdninstagram.com/t51.2885-15/...,640.0,640.0,Amaro,1380248245,27-09-2013 02:17:25,0.843525,263042348,Taylor Degruise,taylordegruise,,https://scontent.cdninstagram.com/t51.2885-19/...,,316.0,347.0,73.0
4,725551583154452417_263042348,https://www.instagram.com/p/oRrVIcJLfB/,https://scontent.cdninstagram.com/t51.2885-15/...,640.0,640.0,Amaro,1400712510,21-05-2014 22:48:30,0.859796,263042348,Taylor Degruise,taylordegruise,,https://scontent.cdninstagram.com/t51.2885-19/...,,316.0,347.0,73.0


## ANP


In [75]:
# #Condense anp dataframe to 5 elements  per image_id

# def clean_anp(df):
#     df['freq'] = df.groupby('image_id')['image_id'].transform('count')
#     df = df.drop_duplicates(subset = ['image_id', 'anp_label','anp_sentiment', 'emotion_score', 'freq'])
#     df = df.drop(['freq'], axis = 1)
#     return df

# filtered_anp = clean_anp(filtered_anp)
# filtered_anp.emotion_label.unique()

# filtered_anp

In [76]:
filtered_anp = filter_by_common_feature(anp, filtered_image_data, 'user_id')
filtered_anp

Unnamed: 0,image_id,anp_label,anp_sentiment,emotion_score,emotion_label,user_id
0,951727030670259635_143763900,hot_boys,0.017,0.1760,amazement,143763900
1,951727030670259635_143763900,young_couple,0.019,0.2113,joy,143763900
2,951727030670259635_143763900,dirty_laundry,-0.263,0.0929,joy,143763900
3,951727030670259635_143763900,global_mall,-0.031,0.1304,interest,143763900
4,951728575726873168_289794729,high_boots,0.025,0.1394,amazement,289794729
5,951728575726873168_289794729,funny_pets,0.078,0.1924,joy,289794729
6,951728575726873168_289794729,slow_motion,-0.064,0.1141,interest,289794729
7,951728575726873168_289794729,funny_dog,0.156,0.2859,joy,289794729
8,951728575726873168_289794729,working_group,0.008,0.1234,amazement,289794729
9,951731159501174327_143763900,old_friends,0.543,0.1270,joy,143763900


In [77]:
#Tuples with the 4 categories
Aggressiveness = ('rage', 'anger', 'annoyance', 'apprehension', 'fear', 'terror')
Attention = ('vigilance', 'anticipation', 'interest', 'distraction', 'surprise', 'amazement')
Pleasantness = ('ecstasy', 'joy', 'serenity', 'pensiveness', 'sadness', 'grief')
Appreciation = ('wadmiration', 'trust', 'acceptance', 'boredom', 'disgust', 'loathing')

#Function for adding scores
def score_anp_sentiments(row, tuple):
    score = (3, 2, 1, -1, -2, -3)
    if row == tuple[0]:
        val = score[0]
    elif row == tuple[1]:
        val = score[1]
    elif row == tuple[2]:
        val = score[2]
    elif row == tuple[3]:
        val = score[3]
    elif row == tuple[4]:
        val = score[4]
    elif row == tuple[5]:
        val = score[5]
    else:
        val = float('nan')
    return val

filtered_anp['Aggressiveness'] = filtered_anp.apply(lambda row : score_anp_sentiments(row['emotion_label'], Aggressiveness), axis=1)
filtered_anp['Attention'] = filtered_anp.apply(lambda row : score_anp_sentiments(row['emotion_label'], Attention), axis=1)
filtered_anp['Pleasantness'] = filtered_anp.apply(lambda row : score_anp_sentiments(row['emotion_label'], Pleasantness), axis=1)
filtered_anp['Appreciation'] = filtered_anp.apply(lambda row : score_anp_sentiments(row['emotion_label'], Appreciation), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [78]:
new_df = filtered_anp.loc[filtered_anp['image_id']=='983381034037196210_545497348']
new_df

Unnamed: 0,image_id,anp_label,anp_sentiment,emotion_score,emotion_label,user_id,Aggressiveness,Attention,Pleasantness,Appreciation
5660,983381034037196210_545497348,ugly_sweater,-0.418,0.095,sadness,545497348,,,-2.0,
5661,983381034037196210_545497348,final_year,-0.065,0.0952,anger,545497348,2.0,,,
5662,983381034037196210_545497348,final_year,-0.065,0.0952,terror,545497348,-3.0,,,
5663,983381034037196210_545497348,happy_halloween,0.324,0.1656,joy,545497348,,,2.0,
5664,983381034037196210_545497348,triple_trouble,0.008,0.0821,amazement,545497348,,-3.0,,
5665,983381034037196210_545497348,big_family,0.023,0.1949,amazement,545497348,,-3.0,,
64307,983381034037196210_545497348,ugly_sweater,-0.418,0.095,sadness,545497348,,,-2.0,
64308,983381034037196210_545497348,final_year,-0.065,0.0952,anger,545497348,2.0,,,
64309,983381034037196210_545497348,final_year,-0.065,0.0952,terror,545497348,-3.0,,,
64310,983381034037196210_545497348,happy_halloween,0.324,0.1656,joy,545497348,,,2.0,


In [79]:
new_df = new_df.groupby('user_id').agg(np.mean).reset_index()
new_df

Unnamed: 0,user_id,anp_sentiment,emotion_score,Aggressiveness,Attention,Pleasantness,Appreciation
0,545497348,-0.032167,0.121333,-0.5,-3.0,0.0,


In [80]:
# Groupby user_id
anp_final = filtered_anp.groupby('user_id').agg(np.mean).reset_index()
anp_final = anp_final.drop(['anp_sentiment'], axis = 1)
anp_final

Unnamed: 0,user_id,emotion_score,Aggressiveness,Attention,Pleasantness,Appreciation
0,1619510,0.166728,0.395437,-2.123223,0.509868,0.616667
1,2829661,0.161615,0.759036,-1.795058,0.358543,0.571429
2,3069744,0.158810,0.127490,-2.021838,0.317474,0.544304
3,3988856,0.151934,0.860566,-2.073277,0.386861,0.154362
4,4185084,0.160690,-0.366667,-1.949821,-0.601504,1.166667
5,6734387,0.179962,0.450593,-2.141340,0.552548,0.683168
6,7410944,0.168647,0.067416,-2.052246,0.370939,1.538462
7,8854187,0.157788,0.677551,-1.792145,0.075148,0.503937
8,9931123,0.168627,0.000000,-2.928571,1.400000,
9,11520833,0.155128,0.813433,-1.892460,0.258225,0.716814


In [81]:
# anp_final = anp_final.fillna(anp_final.mean())
anp_final = anp_final.fillna(0.0)
anp_final.to_pickle('data/FINAL/anp_final_new.pkl')
#add PERMA scores
anp_PERMA = pd.merge(anp_final, survey[['user_id','PERMA']],on='user_id', how='left')

In [82]:
anp_PERMA = pd.merge(anp, survey[['user_id','PERMA']],on='user_id', how='left')

# MERGING FINALIZED DATAFRAMES


## Importing

In [83]:
## Michael 
object_labels_survey_final = pd.read_pickle('data/FINAL/object_labels_final.pkl')

## Ben
image_metrics_final = pd.read_pickle('data/FINAL/image_metrics_final.pkl')
image_metrics_final.columns = ['like_count', 'comment_count']
image_metrics_final =  image_metrics_final.reset_index()
image_metrics_final['user_id'] = image_metrics_final['user_id'].astype(int)

## Lino
face_df_final = pd.read_pickle('data/FINAL/face_df_final.pkl')

In [84]:
## Dimitris
image_data_final = filtered_image_data[['user_id', 'user_followed_by', 'user_follows', 'user_posted_photos']]
image_data_final.to_pickle('data/FINAL/image_data_final.pkl')

# imagedata_anp_final = pd.merge(image_data_final, anp_final, how='left', on='user_id')
# imagedata_anp_final = image_anp_PERMA.groupby('user_id').agg(np.mean).reset_index()

In [85]:
image_data_final.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41206 entries, 0 to 41205
Data columns (total 4 columns):
user_id               41206 non-null int64
user_followed_by      41206 non-null float64
user_follows          41206 non-null float64
user_posted_photos    41206 non-null float64
dtypes: float64(3), int64(1)
memory usage: 1.6 MB


## Merging

In [86]:
#merge anp and image_data
image_data_final = filtered_image_data[['user_id', 'user_followed_by', 'user_follows', 'user_posted_photos']]
imagedata_anp_final = pd.merge(image_data_final, anp_final, how='left', on='user_id')
imagedata_anp_final = image_anp_PERMA.groupby('user_id').agg(np.mean).reset_index()

#merge face
imagedata_anp_face_final = pd.merge(imagedata_anp_final, face_df_final, how='left', on='user_id')

#merge image_metrics
imagedata_anp_face_metrics_final = pd.merge(imagedata_anp_face_final, image_metrics_final, how='left', on='user_id')

#merge object labels
final_df = pd.merge(imagedata_anp_face_metrics_final, object_labels_survey_final, how='left', on='user_id')

# image_anp_object_labels_survey_final = pd.merge(image_anp_final, object_labels_survey_final, how='left', on='user_id')

NameError: name 'image_anp_PERMA' is not defined

In [None]:
import seaborn as sns

corr = anp_PERMA.corr()

plt.rcParams['figure.figsize'] = 10,8
fig = plt.figure()
sns.heatmap(corr, 
            xticklabels=corr.columns.values,
            yticklabels=corr.columns.values,
            cmap="viridis")
plt.show()
# fig.savefig('correlation.png')

# s = abs(corr.unstack())
# so = s.sort_values(kind="quicksort", ascending=False)
# so.head(25)



## ANP  Regression analysis for feature selection 

In [None]:
%%time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
import statsmodels.formula.api as sm
import math

In [None]:
import statsmodels.api as sm
from scipy import stats

def run_multi_linear_regression (features_list , dataframe):
    X = dataframe[features_list]
    y = dataframe['PERMA']
    
    X_train, X_test,  y_train, y_test = train_test_split(X,y,test_size=0.2, random_state = 42)

#     print(X_train.shape)
#     print(X_test.shape)
#     print(y_train.shape)
#     print(y_test.shape)
    lm = LinearRegression()
    crossval_score = cross_val_score(lm, X_train, y_train, cv=5, scoring = 'neg_mean_absolute_error')
    lm.fit(X_train, y_train)
    
    X2 = sm.add_constant(X_train)
    est = sm.OLS(y_train, X2)
    est2 = est.fit()
    print(est2.summary())
    
    #Make prediction
    y_pred = lm.predict(X_test)
    #The intercept
    print('Intercept: \n', lm.intercept_)
    #The coefficients
    print('Coefficients: \n', lm.coef_)
#     The mean squared error
    print("Mean squared error: %.2f"% mean_squared_error(y_test, y_pred))
#     The root mean squared error
    print("Root mean squared error: %.2f"% math.sqrt(mean_squared_error(y_test, y_pred)))
#     Explained variance score: 1 is perfect prediction
    print('R squared: %.2f' % r2_score(y_test, y_pred))
    results_dict = {'features':features_list,
                    'MSE': mean_squared_error(y_test, y_pred),
                   'R_sq': r2_score(y_test, y_pred),
                    'cross_val': [crossval_score.mean(), crossval_score.std()]}
                   
    
    return results_dict    

#     sns.set(rc=None)
#     plt.scatter(X_test, y_test,  color='black')
#     plt.plot(X_test, y_pred, color='blue', linewidth=3)
#     plt.xlabel('Mean %s score' %features)
#     plt.ylabel("PERMA Score")
#     plt.title("Plot of %s" %features)
#     plt.xticks(())

#     plt.yticks(())

In [None]:
# step-up feature selection

features = list(anp_PERMA)
features.remove('PERMA')
features.remove('user_id')
features_for_regr = features
print(features)
# features_for_regr.remove('Valuation')
# features_for_regr.remove('anp_sentiment')
features_for_regr.remove('emotion_score')
run_multi_linear_regression(features_for_regr.copy(), anp_PERMA)

# ##STEP 1
# results = []
# for feature in features:
#     print('feature')
#     feature = [feature]
#     results.append(run_multi_linear_regression(feature , anp_PERMA))
# # for feature in features:
# df = pd.DataFrame(results)


# selected_features = []
# best_feat = df['features'][df['R_sq'].idxmax()]
# features = [e for e in features if e not in best_feat]
# print(best_feat)
# selected_features.extend(best_feat)
# print(selected_features)

# ##STEP 2
# results = []
# for feature in features:
#     features_for_regr = selected_features
#     features_for_regr.append(feature)
#     print('features_for_regr')
#     results.append(run_multi_linear_regression(features_for_regr.copy(), anp_PERMA))
#     features_for_regr.remove(feature)
# df = pd.DataFrame(results)

# df
# # ##STEP 3

# # results = []
# # for feature in features:
# #     feature = [feature]
# #     features_for_reg = selected_features
# #     features_for_reg.extend(feature)
# #     run_multi_linear_regression(features_for_reg, anp_PERMA)
# #     features_for_reg.remove(feature[0])

In [None]:
image_data_PERMA = filtered_image_data[['user_id', 'user_followed_by', 'user_follows', 'user_posted_photos']]
# image_data_PERMA = pd.merge(image_data_PERMA, survey[['user_id','PERMA']],on='user_id', how='left')
# image_data_PERMA.head()

image_anp_PERMA = pd.merge(image_data_PERMA, anp_PERMA, how='left', on='user_id')
image_anp_PERMA.info()
image_anp_PERMA = image_anp_PERMA.groupby('user_id').agg(np.mean).reset_index()
image_anp_PERMA

In [None]:
features = list(image_anp_PERMA)
features.remove('PERMA')
features.remove('user_id')

features.remove('emotion_score')
features.remove('Valuation')
features.remove('user_followed_by')
features.remove('anp_sentiment')
features.remove('Attention')
features.remove('user_follows')

features.remove('Pleasantness')
# features_for_regr.remove('anp_sentiment')
# features_for_regr.remove('emotion_score')
# run_multi_linear_regression(features.copy(), image_anp_PERMA)

In [None]:
from sklearn.svm import SVR
import matplotlib.pyplot as plt

# def run_SVM(features_list, dataframe):
#     X = dataframe[features_list]
#     y = dataframe['PERMA']

#     # #############################################################################
#     # Add noise to targets
#     y[::5] += 3 * (0.5 - np.random.rand(8))

#     # #############################################################################
#     # Fit regression model
#     svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1)
#     svr_lin = SVR(kernel='linear', C=1e3)
#     svr_poly = SVR(kernel='poly', C=1e3, degree=2)
#     y_rbf = svr_rbf.fit(X, y).predict(X)
#     y_lin = svr_lin.fit(X, y).predict(X)
#     y_poly = svr_poly.fit(X, y).predict(X)
    
#     lw = 2
#     plt.scatter(X, y, color='darkorange', label='data')
#     plt.plot(X, y_rbf, color='navy', lw=lw, label='RBF model')
#     plt.plot(X, y_lin, color='c', lw=lw, label='Linear model')
#     plt.plot(X, y_poly, color='cornflowerblue', lw=lw, label='Polynomial model')
#     plt.xlabel('data')
#     plt.ylabel('target')
#     plt.title('Support Vector Regression')
#     plt.legend()
#     plt.show()


In [None]:

features = list(image_anp_PERMA)
features.remove('PERMA')
features.remove('user_id')

# features.remove('emotion_score')
# features.remove('Valuation')
# features.remove('user_followed_by')
# features.remove('anp_sentiment')
# features.remove('Attention')
# features.remove('user_follows')
X = image_anp_PERMA[features]
y = image_anp_PERMA['PERMA']

    
X_train, X_test,  y_train, y_test = train_test_split(X,y,test_size=0.2, random_state = 42)

#     print(X_train.shape)
#     print(X_test.shape)
#     print(y_train.shape)
#     print(y_test.shape)
# lm.fit(X_train, y_train)
# #############################################################################
# Add noise to targets
# y[::5] += 3 * (0.5 - np.random.rand(8))

# #############################################################################
# Fit regression model

# svr_lin = SVR(kernel='linear', C=1e3)
# print('lin')
# svr_poly = SVR(kernel='poly', C=1e3, degree=2)
# print('poly')
svr_rbf = SVR(kernel='linear', C = 500)
print('rbf')
model = svr_rbf.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# # #     The mean squared error
# print("Mean squared error: %.2f"% mean_squared_error(y_test, y_rbf))
# #     The root mean squared error
print("Root mean squared error: %.2f"% math.sqrt(mean_squared_error(y_train, y_pred_train)))
print("Root mean squared error: %.2f"% math.sqrt(mean_squared_error(y_test, y_pred_test)))



# #     Explained variance score: 1 is perfect prediction
# print('R squared: %.2f' % r2_score(y_test, y_rbf))
# results_dict = {'MSE': mean_squared_error(y_test, y_rbf),
#                'R_sq': r2_score(y_test, y_rbf)}


# y_lin = svr_lin.fit(X_train, y_train).predict(X_test)
# y_poly = svr_poly.fit(X_train, y_train).predict(X_test)

# lw = 2
# plt.scatter(X, y, color='darkorange', label='data')
# plt.plot(X, y_rbf, color='navy', lw=lw, label='RBF model')
# plt.plot(X, y_lin, color='c', lw=lw, label='Linear model')
# plt.plot(X, y_poly, color='cornflowerblue', lw=lw, label='Polynomial model')
# plt.xlabel('data')
# plt.ylabel('target')
# plt.title('Support Vector Regression')
# plt.legend()
# plt.show()
# features.remove('Pleasantness')
# run_SVM(features.copy(), image_anp_PERMA)

y_rbf

In [None]:
# %%time
# ##Merge dataframes into one big one
# image_survey = pd.merge(survey, filtered_image_data, how='inner', on='user_id')
# image_anp_frame = pd.merge(image_survey, filtered_anp, how='inner', on='image_id')
# image_anp_metrics_frame = pd.merge(image_anp_frame, filtered_image_metrics, how='inner', on='image_id')
# image_anp_metrics_labels_frame = pd.merge(image_anp_metrics_frame, filtered_object_labels, how='inner', on='image_id')
# image_anp_metrics_labels_face_frame = pd.merge(image_anp_metrics_labels_frame, filtered_face, how='inner', on='image_id')
# complete_df = pd.merge(image_anp_metrics_labels_face_frame, filtered_celebrity, how='inner', on='image_id')


In [None]:
# dropped_features= ['image_url', 'image_height', 'image_width', 'image_posted_time_unix',
#                    'image_posted_time',     
#     ]

# len(survey_filtered['insta_user_id'].unique())

In [None]:
# %%time
# from scipy.stats import spearmanr
# spearman = scipy.stats.spearmanr(complete_df)

In [None]:
# import seaborn as sns
# plt.rcParams['figure.figsize'] = 10,8
# fig = plt.figure()
# sns.heatmap(corr, 
#             xticklabels=corr.columns.values,
#             yticklabels=corr.columns.values,
#             cmap="viridis")
# plt.show()
## fig.savefig('correlation.png')

# s = abs(corr.unstack())
# so = s.sort_values(kind="quicksort", ascending=False)

In [None]:
df = filtered_anp
df.head()

In [None]:
image_metrics.head()

In [None]:
used = filtered_anp.loc[filtered_anp['image_id']=='951727030670259635_143763900']

In [None]:
final_object_labels = filtered_object_labels.groupby('image_id').first().reset_index().drop(['data_amz_label_confidence'], axis=1)

filtered_object_labels

In [None]:
image_metrics['user_id'] = image_metrics['image_id'].map(lambda x: x.split('_')[1])
f_set = {'like_count':['mean','median'], 'comment_count':['mean','median']}
new_df = image_metrics.groupby('user_id').agg(f_set)
new_df.info()
new_df.head()

In [None]:

new_df = filtered_anp
new_df = pd.get_dummies(new_df, columns=['emotion_label'], drop_first=True)
new_df.head()

In [None]:
def most_freq(pdseries):
    return pdseries.mode()

f_set = {'anp_sentiment':np.mean, 'emotion_score':np.mean, 'emotion_label':most_freq}
new_df = filtered_anp.groupby('user_id').agg(f_set)
new_df.columns = ['anp_sentiment', 'emotion_score', 'emotion_label']
new_df.emotion_label.value_counts()

In [None]:
['emotion_score'].corr(new_df['anp_sentiment'])


In [None]:
certain_df = used['emotion_label']

most_freq(certain_df)

In [None]:

new_df = filtered_object_labels.groupby('image_id').first().reset_index().drop(['data_amz_label_confidence', 'index'], axis=1)

In [None]:
len(new_df['data_amz_label'].value_counts())

In [None]:
anp

In [90]:
face.groupby('user_id')['face_id']

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x7ffab261b7f0>