In [51]:
import json

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import sklearn
import sklearn.linear_model
from scipy import spatial
from scipy import stats

from embeddings import get_embedding

In [3]:
# Load the CSV of text-response embeddings into the working frame.
df = pd.read_csv(filepath_or_buffer='../../data/text_response_embeddings.csv')

#### Definition presented to parents in survey:

“Social media” is a broad term that refers to online platforms and apps that enable users to create profiles, interact with others, and share content, such as information, ideas, images, videos, and personal messages. These platforms allow individuals or groups to build communities, engage in discussions, and connect across different networks. Examples of social media include platforms like Facebook, Instagram, TikTok, Twitter (X), Snapchat, YouTube, and others. These platforms can be used for both personal and professional purposes, with features such as posting, commenting, messaging, and sharing content.


In [4]:
# Definition text to embed, written one sentence per literal; adjacent string
# literals are concatenated by Python into a single string.
given_definition = (
    "'Social media' is a broad term that refers to online platforms and apps that enable users to create profiles, interact with others, and share content, such as information, ideas, images, videos, and personal messages. "
    "These platforms allow individuals or groups to build communities, engage in discussions, and connect across different networks. "
    "Examples of social media include platforms like Facebook, Instagram, TikTok, Twitter (X), Snapchat, YouTube, and others. "
    "These platforms can be used for both personal and professional purposes, with features such as posting, commenting, messaging, and sharing content."
)
# Embed the definition so responses can be compared against it.
embedded_definition = get_embedding(given_definition)

'Social media' is a broad term that refers to online platforms and apps that enable users to create profiles, interact with others, and share content, such as information, ideas, images, videos, and personal messages. These platforms allow individuals or groups to build communities, engage in discussions, and connect across different networks. Examples of social media include platforms like Facebook, Instagram, TikTok, Twitter (X), Snapchat, YouTube, and others. These platforms can be used for both personal and professional purposes, with features such as posting, commenting, messaging, and sharing content.


In [5]:
# The q1_embedding column stores each vector as JSON text; decode every cell.
q1_embeddings = [json.loads(raw_vector) for raw_vector in df['q1_embedding']]


In [6]:
def get_similarity(embedding1, embedding2):
    """Return the cosine similarity between two embedding vectors.

    SciPy provides cosine *distance*; the similarity is its complement,
    i.e. ``1 - distance``.
    """
    cosine_distance = spatial.distance.cosine(embedding1, embedding2)
    return 1 - cosine_distance

In [7]:
# Similarity of each response to the embedded survey definition.
df['sim_to_def'] = [get_similarity(x, embedded_definition) for x in q1_embeddings]

# Mean cosine similarity of each response to every *other* response.
#
# The self-comparison is excluded by position (the matrix diagonal), not by
# value: filtering with `e != x` would also drop other respondents whose
# embedding happens to be identical, biasing their mean. All pairwise
# similarities come from a single matrix product of unit-normalised rows
# instead of ~n^2 Python-level cosine calls.
embedding_matrix = np.asarray(q1_embeddings, dtype=float)  # assumes all vectors have equal length
unit_rows = embedding_matrix / np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
pairwise_sims = unit_rows @ unit_rows.T
n_responses = len(unit_rows)
# Row sum minus the self-similarity on the diagonal, averaged over the n - 1 others.
mean_sims = (pairwise_sims.sum(axis=1) - np.diag(pairwise_sims)) / (n_responses - 1)
df['mean_sim'] = mean_sims


In [8]:
# similarity to each other's responses


In [9]:
# Display the frame, which at this point carries the sim_to_def and mean_sim columns.
df

Unnamed: 0,record_id,survey_num,sm_def,q2_24,q2_23_a,q2_23_b,q2_23_c,q2_23_d,q2_23_e,q2_23_f,...,twitch,games,whatsapp,twitter,youtube,zoom,q1_embedding,q2_embedding,sim_to_def,mean_sim
0,21,1,I would define social media to be consumption ...,I would not want to expose my child for fake n...,4,6,6,6,5,6,...,1,2,2,1,1,2,"[0.017319293692708015, -0.02876683883368969, -...","[0.032916080206632614, -0.024085627868771553, ...",0.644386,0.455379
1,24,1,I would define social media as applications or...,If it's appropriate for their age and also if ...,2,1,1,1,3,3,...,1,2,2,1,1,2,"[0.012307570315897465, -0.010607896372675896, ...","[0.049430962651968, 0.02293068915605545, 0.011...",0.691578,0.527637
2,31,1,A place for people to share and communicate ev...,Language,4,2,4,4,3,3,...,1,2,2,1,1,2,"[0.038517631590366364, -0.04865860193967819, -...","[-0.003726561553776264, -0.021159188821911812,...",0.440532,0.466535
3,33,1,Online access to friends and sharing with others,"If it is made for someone their age- scary, un...",5,4,1,2,6,7,...,3,2,2,1,2,2,"[0.028487812727689743, -0.007498628459870815, ...","[0.040484052151441574, 0.0374775156378746, -0....",0.448627,0.475861
4,37,1,A platform where one can express their thought...,can't think of any right now,7,7,7,7,7,7,...,3,2,2,1,1,2,"[0.028086761012673378, -0.03424948826432228, 0...","[-0.0123531399294734, -0.023728644475340843, 0...",0.446222,0.476290
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997,1850,4,Social media is about a way to communicate onl...,"The type of content, the impact the content wi...",2,3,2,3,5,2,...,1,2,1,1,1,1,"[0.04264078289270401, -0.03377247229218483, -0...","[0.04271574690937996, 0.00401210505515337, -0....",0.636907,0.541577
998,1857,4,Social media refers to online platforms or too...,none,2,2,2,6,6,2,...,3,2,1,1,1,2,"[0.017521122470498085, 0.0012156404554843903, ...","[0.010587280616164207, 0.0018031198997050524, ...",0.792318,0.552278
999,1868,4,"I think of Facebook, TikTok, Instagram, and wh...",Every kid is different. There are definitely ...,4,1,5,7,3,1,...,1,2,1,1,1,2,"[0.02997586317360401, -0.02118377946317196, -0...","[0.023698655888438225, 0.044606223702430725, -...",0.544948,0.432944
1000,1912,4,"Any of the apps, facebook, instagram, tik tok ...",Whether it serves a purpose or is just brain r...,3,1,1,1,1,1,...,1,2,1,1,1,2,"[-0.0029255130793899298, 0.009177840314805508,...","[-0.013439121656119823, 0.028514310717582703, ...",0.556932,0.452417


In [10]:
# One-sided tests: the alternative hypothesis is a positive association.
# spearman correlation
s_s, s_p = stats.spearmanr(df['sim_to_def'], df['mean_sim'], alternative='greater')
# pearson correlation
p_s, p_p = stats.pearsonr(df['sim_to_def'], df['mean_sim'], alternative='greater')

# One marker per response: x = similarity to the definition,
# y = mean similarity to the other responses.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=df['sim_to_def'],
        y=df['mean_sim'],
        mode='markers',
        marker={
            'size': 10,
            'color': 'rgba(0, 0, 255, 0.5)',
            'line': {'width': 2},
        },
    )
)

fig.update_layout(
    title='Cosine Similarity to Definition vs. Mean Cosine Similarity to Other Responses',
    xaxis_title='Similarity to Given Definition',
    yaxis_title='Mean Similarity to Others',
    showlegend=False,
    width=800,
    height=600,
)

# Correlation summary placed in paper coordinates (fractions of the plot area).
fig.add_annotation(
    text=f'Spearman correlation: {s_s:.2f} (p-value: {s_p:.2e})<br>Pearson correlation: {p_s:.2f} (p-value: {p_p:.2e})',
    xref='paper',
    yref='paper',
    x=0.5,
    y=0.1,
    showarrow=False,
    font={'size': 12, 'color': 'black'},
    bgcolor='rgba(255, 255, 255, 0.8)',
)

# Save a static copy of the figure.
fig.write_image('../../figures/similarity_to_def_vs_mean_similarity.png')

In [11]:
# Display the scatter figure built in the cell above.
fig.show()

In [18]:
# add data about which platforms are used by the parent
# Item q2_25 is stored as one column per answer option, named q2_25___<code>:
# 1, Discord | 2, Facebook or Facebook Messenger | 3, Instagram | 4, iMessage / SMS Text Messaging | 5, Messenger Kids | 6, Pinterest | 7, Reddit | 8, Roblox | 9, Signal | 10, Snapchat | 11, Telegram | 12, TikTok | 13, Tumblr | 14, Twitch | 15, Twitter | 16, WhatsApp | 17, YouTube | 18, Other | 19, I do not have a social media account or use social media.
option_columns = [f'q2_25___{code}' for code in range(1, 20)]  # options 1..19

# Keep only the respondent id and the q2_25 option columns.
platforms_df = pd.read_csv('../../data/PPTOB_Data_03.21.csv')[['record_id'] + option_columns]

# Row-wise sum over options 1..18; option 19 ("I do not have a social media
# account") is left out of the count.
# NOTE(review): presumably each option column is coded 0/1 - confirm against the codebook.
platforms_df['n_platforms'] = platforms_df[option_columns[:-1]].sum(axis=1)

In [21]:
# Attach each respondent's platform count via a left join on record_id.
# Any n_platforms column left over from an earlier run of this cell is dropped
# first: merging again would otherwise create n_platforms_x / n_platforms_y
# and later df['n_platforms'] lookups would fail.
df = (
    df.drop(columns='n_platforms', errors='ignore')
    .merge(platforms_df[['record_id', 'n_platforms']], on='record_id', how='left')
)

In [78]:
# Strip plot: similarity to the given definition, grouped by number of platforms used.
fig = px.strip(
    df,
    x='n_platforms',
    y='sim_to_def',
    title='Cosine Similarity of Response to Given Definition by Number of Platforms Used',
)

fig.update_traces(marker=dict(
    color='rgba(0, 0, 255, 0.5)',
    line=dict(width=2)))

fig.update_layout(
    xaxis_title='Number of Platforms Used',
    yaxis_title='Similarity',
    showlegend=False,
    width=1000,
    height=600,
)

# The left merge leaves n_platforms as NaN for any record_id without a match.
# NaN makes LinearRegression.fit raise and turns the correlations into NaN,
# so the fit and the statistics use complete (x, y) pairs only.
paired = df[['n_platforms', 'sim_to_def']].dropna()

# add regression line
X = paired['n_platforms'].to_numpy().reshape(-1, 1)  # sklearn expects a 2-D feature matrix

# sklearn.linear_model has to be imported as a submodule; a bare
# `import sklearn` is not guaranteed to expose it.
model = sklearn.linear_model.LinearRegression()
model.fit(X, paired['sim_to_def'])

# Evaluate the fitted line on an evenly spaced grid across the observed x-range.
x_range = np.linspace(X.min(), X.max())
y_range = model.predict(x_range.reshape(-1, 1))

fig.add_trace(go.Scatter(
    x=x_range,
    y=y_range,
    name='Regression Fit',
    marker=dict(
        color='rgb(116, 77, 145)',
    ),
))

# One-sided tests: the alternative hypothesis is a positive association.
# spearman correlation
s_s, s_p = stats.spearmanr(paired['n_platforms'], paired['sim_to_def'], alternative='greater')
# pearson correlation
p_s, p_p = stats.pearsonr(paired['n_platforms'], paired['sim_to_def'], alternative='greater')

# Correlation summary placed in paper coordinates (fractions of the plot area).
fig.add_annotation(
    xref='paper',
    yref='paper',
    x=0.98,
    y=0.1,
    text=f'Spearman correlation: {s_s:.2f} (p-value: {s_p:.2e})<br>Pearson correlation: {p_s:.2f} (p-value: {p_p:.2e})',
    showarrow=False,
    font=dict(size=12, color='black'),
    bgcolor='rgba(255, 255, 255, 0.8)',
)

# Turn the legend back on (it was disabled above) so the regression line is labelled.
fig.update_layout(
    showlegend=True,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        x=.85
    )
)

# Save a static copy, then display the figure.
fig.write_image('../../figures/similarity_to_def_vs_platforms_used.png')

fig.show()

In [80]:
# Show only the per-platform columns of the frame.
df.loc[:, [
    'bereal', 'discord', 'fb', 'text', 'instagram', 'linkedin',
    'messenger_kids', 'shopping', 'pinterest', 'reddit', 'roblox',
    'signal', 'snapchat', 'telegram', 'tiktok', 'tumblr', 'twitch',
    'games', 'whatsapp', 'twitter', 'youtube', 'zoom',
]]

Unnamed: 0,bereal,discord,fb,text,instagram,linkedin,messenger_kids,shopping,pinterest,reddit,...,snapchat,telegram,tiktok,tumblr,twitch,games,whatsapp,twitter,youtube,zoom
0,3,2,1,2,1,1,2,2,1,1,...,1,2,1,1,1,2,2,1,1,2
1,3,1,1,2,1,1,1,2,2,1,...,1,2,1,1,1,2,2,1,1,2
2,3,1,1,2,1,2,1,2,2,2,...,1,2,1,1,1,2,2,1,1,2
3,1,2,1,2,1,1,2,2,2,2,...,1,1,1,3,3,2,2,1,2,2
4,1,1,1,2,1,1,2,2,2,1,...,1,3,1,2,3,2,2,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997,1,1,1,2,1,1,3,1,1,1,...,1,1,1,1,1,2,1,1,1,1
998,3,3,1,2,1,1,1,2,1,1,...,1,1,1,3,3,2,1,1,1,2
999,3,1,1,1,1,1,3,3,1,1,...,1,3,1,1,1,2,1,1,1,2
1000,3,1,1,2,1,1,1,2,1,1,...,1,1,1,1,1,2,1,1,1,2
