In [44]:
import pandas as pd
from embeddings import get_embedding
import json
import numpy as np
from scipy import spatial
from scipy import stats
import plotly.graph_objects as go
import plotly.express as px
import sklearn
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

In [45]:
df = pd.read_csv('../../data/text_response_embeddings.csv')

# Defining Social Media

#### Definition presented to parents in survey:

“Social media” is a broad term that refers to online platforms and apps that enable users to create profiles, interact with others, and share content, such as information, ideas, images, videos, and personal messages. These platforms allow individuals or groups to build communities, engage in discussions, and connect across different networks. Examples of social media include platforms like Facebook, Instagram, TikTok, Twitter (X), Snapchat, YouTube, and others. These platforms can be used for both personal and professional purposes, with features such as posting, commenting, messaging, and sharing content."


In [46]:
given_definition = "'Social media' is a broad term that refers to online platforms and apps that enable users to create profiles, interact with others, and share content, such as information, ideas, images, videos, and personal messages. These platforms allow individuals or groups to build communities, engage in discussions, and connect across different networks. Examples of social media include platforms like Facebook, Instagram, TikTok, Twitter (X), Snapchat, YouTube, and others. These platforms can be used for both personal and professional purposes, with features such as posting, commenting, messaging, and sharing content."
embedded_definition = get_embedding(given_definition)

'Social media' is a broad term that refers to online platforms and apps that enable users to create profiles, interact with others, and share content, such as information, ideas, images, videos, and personal messages. These platforms allow individuals or groups to build communities, engage in discussions, and connect across different networks. Examples of social media include platforms like Facebook, Instagram, TikTok, Twitter (X), Snapchat, YouTube, and others. These platforms can be used for both personal and professional purposes, with features such as posting, commenting, messaging, and sharing content.


In [47]:
q1_embeddings = df['q1_embedding'].to_list()
q1_embeddings = [json.loads(x) for x in q1_embeddings]

### Agreement with definition and each other

In [48]:
def get_similarity(embedding1, embedding2):
    return 1 - spatial.distance.cosine(embedding1, embedding2)

In [49]:
# cos sim to definition
df['sim_to_def'] = [get_similarity(x, embedded_definition) for x in q1_embeddings]
# mean cos sim to other responses
mean_sims = []
for x in q1_embeddings:
    others = [e for e in q1_embeddings if e != x]
    similarities = [get_similarity(x, e) for e in others]
    mean_sim = np.mean(similarities)
    mean_sims.append(mean_sim)
df['mean_sim'] = mean_sims


#### fig 1

In [72]:
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=df['sim_to_def'],
    y=df['mean_sim'],
    mode='markers',
    text=df['sm_def'],
    hoverinfo='all',
    marker=dict(
        size=5,
        color='cornflowerblue',
        line=dict(width=1.5)
    ),
))

fig.update_layout(
    title='Cosine Similarity to Definition vs. Mean Cosine Similarity to Other Responses',
    xaxis_title='Similarity to Given Definition',
    yaxis_title='Mean Similarity to Others',
    showlegend=False,
    width=800,
    height=700,
    xaxis=dict(range=[0, 1]),
    yaxis=dict(range=[0, 1]),
)
# spearman correlation
s_s, s_p = stats.spearmanr(df['sim_to_def'], df['mean_sim'], alternative='greater')
# pearson correlation
p_s, p_p = stats.pearsonr(df['sim_to_def'], df['mean_sim'], alternative='greater')

fig.add_annotation(
    xref='paper',
    yref='paper',
    x=0.5,
    y=0.1,
    text=f'Spearman correlation: {s_s:.2f} (p-value: {s_p:.2e})<br>Pearson correlation: {p_s:.2f} (p-value: {p_p:.2e})',
    showarrow=False,
    font=dict(size=12, color='black'),
    bgcolor='rgba(255, 255, 255, 0.8)',
)

fig.add_vline(
    x=0.25,
    line_width=2,
    line_dash='dash',
    opacity=0.5,
    annotation_text='Similarity score to <br>definition < 0.25',
    annotation_position="left",
              annotation=dict(font_size=10, font_color ='orangered'),
              line_color='orangered')


fig.write_image('../../figures/similarity_to_def_vs_mean_similarity.png')
fig.write_html('../../figures/similarity_to_def_vs_mean_similarity.html')

fig.show()

### Parent Social Media Use

In [51]:
# add data about which platforms are used by the parent
sm_use = ['q2_25___1', 'q2_25___2', 'q2_25___3', 'q2_25___4', 'q2_25___5', 'q2_25___6', 'q2_25___7', 'q2_25___8', 'q2_25___9', 'q2_25___10', 'q2_25___11', 'q2_25___12', 'q2_25___13', 'q2_25___14', 'q2_25___15', 'q2_25___16', 'q2_25___17', 'q2_25___18']
platforms_df = pd.read_csv('../../data/PPTOB_Data_03.21.csv')[['record_id'] + sm_use + ['q2_25___19']]

platforms_df['n_platforms'] = platforms_df[sm_use].sum(axis=1)

# merge with the main df
df = df.merge(platforms_df, on='record_id', how='left')

#### social media data dictionary
<b>Use:</b> <br/>
q2_25___1 - q2_25___19 <br/>
"Please indicate which platform(s) YOU have an account on or use."<br/>
1, Discord | 2, Facebook or Facebook Messenger | 3, Instagram | 4, iMessage / SMS Text Messaging | 5, Messenger Kids | 6, Pinterest | 7, Reddit | 8, Roblox | 9, Signal | 10, Snapchat | 11, Telegram | 12, TikTok | 13, Tumblr | 14, Twitch | 15, Twitter | 16, WhatsApp | 17, YouTube | 18, Other | 19, I do not have a social media account or use social media.

<b>Ratings:</b> <br/>
1, IS Social Media | 2, Is NOT Social Media | 3, Unfamiliar With Platform


<b>Social Media Labels: </b> <br/>
Ranking of if each platform is “social media:<br/>
(Tied) Twitter, Tiktok, Snapchat, Reddit, Linkedin, Tumblr, Twitch, Pinterest, BeReal, Facebook, Instagram  (14 points)<br/>
Youtube (13 points)<br/>
Discord (12 points)<br/>
Roblox, Video Games (11 points)<br/>
Telegram (8 points)<br/>
Whatsapp, Messenger Kids, Zoom, Signal, Online Shopping Sites (7 points)

In [52]:
# rename columns on social media ratings - parents indicated whether or not they believe each is a form of social media
platforms = ['bereal',
       'discord', 'fb', 'text', 'instagram', 'linkedin', 'messenger_kids',
       'shopping', 'pinterest', 'reddit', 'roblox', 'signal', 'snapchat',
       'telegram', 'tiktok', 'tumblr', 'twitch', 'games', 'whatsapp',
       'twitter', 'youtube', 'zoom']
for sm in platforms:
    df.rename(
        columns={
            sm: f'{sm}_rating'
        },
        inplace=True
    )
sm_ratings = [x + '_rating' for x in platforms]

# rename columns on social media usage - parents indicated whether or not they use each platform
df.rename(columns={
    'q2_25___1' :'discord_use',
    'q2_25___2' : 'fb_use',
    'q2_25___3' : 'instagram_use',
    'q2_25___4' : 'text_use',
    'q2_25___5' : 'messenger_kids_use',
    'q2_25___6' : 'pinterest_use',
    'q2_25___7' : 'reddit_use',
    'q2_25___8' : 'roblox_use',
    'q2_25___9' : 'signal_use',
    'q2_25___10' : 'snapchat_use',
    'q2_25___11' : 'telegram_use',
    'q2_25___12' : 'tiktok_use',
    'q2_25___13' : 'tumblr_use',
    'q2_25___14' : 'twitch_use',
    'q2_25___15' : 'twitter_use',
    'q2_25___16' : 'whatsapp_use',
    'q2_25___17' : 'youtube_use',
    'q2_25___18' : 'other_use',
    'q2_25___19' : 'none',
},inplace=True,
)


In [53]:
# social media categorization to use for scoring parents' responses
sm_rankings = {
    'twitter': 14,
    'tiktok': 14,
    'snapchat': 14,
    'reddit': 14,
    'linkedin': 14,
    'tumblr': 14,
    'twitch': 14,
    'pinterest': 14,
    'bereal': 14,
    'fb': 14,
    'instagram': 14,
    'youtube': 13,
    'discord': 12,
    'roblox': 11,
    'games': 11,
    'telegram': 11,
    'whatsapp': 7,
    'messenger_kids': 7,
    'zoom': 7,
    'signal': 7,
    'shopping': 7,
    'text': 3
}

In [54]:
# accuracy of parent's ratings:
def get_accuracy(df:pd.DataFrame, score_threshold:int):
    for p in platforms:
        platform_rating = df[f'{p}_rating']
        platform_ranking = sm_rankings[p]
        if platform_ranking >= score_threshold:
            df[f'{p}_accuracy'] = np.where(platform_rating == 1, 1, 0)
        if platform_ranking < score_threshold:
            df[f'{p}_accuracy'] = np.where(platform_rating == 2, 1, 0)
    # total accuracy for parent
    df['total_accuracy'] = df[[f'{p}_accuracy' for p in platforms]].sum(axis=1)/22

for t in [7, 11, 12, 13, 14]:
    print('threshold:' + str(t))
    get_accuracy(df, t)
    print('accuracy:' + str(round(np.mean(df['total_accuracy']),3)))
    print('-----------------------------')


threshold:7
accuracy:0.633
-----------------------------
threshold:11
accuracy:0.666
-----------------------------
threshold:12
accuracy:0.668
-----------------------------
threshold:13
accuracy:0.643
-----------------------------
threshold:14
accuracy:0.616
-----------------------------


In [55]:
# set final values for accuracy
get_accuracy(df, 11)

#### fig 2

In [56]:
fig = px.box(
    df,
    x='n_platforms',
    y='sim_to_def',
    title='Cosine Similarity of Response to Given Definition by Number of Platforms Used',
    points='all'
)

fig.update_traces(
    # jitter=.7,
    marker=dict(
        color='rgba(50, 100, 255, 1)',
        line=dict(width=2)))

fig.update_layout(
    showlegend=False,
)

# add regression line
X = df.n_platforms.values.reshape(-1, 1)

model = sklearn.linear_model.LinearRegression()
model.fit(X, df.sim_to_def)

x_range = np.linspace(X.min(), X.max())
y_range = model.predict(x_range.reshape(-1, 1))

# fig.add_trace(go.Scatter(
#     x=x_range,
#     y=y_range,
#     name='Regression Fit',
#     marker=dict(
#         color='rgb(116, 77, 145)',
#     ),

# ),
# )


# spearman correlation
s_s, s_p = stats.spearmanr(df['n_platforms'], df['sim_to_def'], alternative='greater')
# pearson correlation
p_s, p_p = stats.pearsonr(df['n_platforms'], df['sim_to_def'], alternative='greater')

fig.add_annotation(
    xref='paper',
    yref='paper',
    x=0.98,
    y=0.1,
    text=f'Spearman correlation: {s_s:.2f} (p-value: {s_p:.2f})<br>Pearson correlation: {p_s:.2f} (p-value: {p_p:.2f})',
    showarrow=False,
    font=dict(size=12, color='black'),
    bgcolor='rgba(255, 255, 255, 0.8)',
)

fig.update_layout(
    xaxis_title='Number of Platforms Used',
    yaxis_title='Similarity',
    width=1000,
    height=600,
    showlegend=True, 
    legend =dict(
        orientation="h",
        yanchor="bottom",
        x=.85
        )
    )

fig.write_image('../../figures/similarity_to_def_vs_platforms_used.png')

fig.show()

#### with buckets of social media use

In [None]:
df['sm_use_level'] = df['n_platforms'].apply(lambda x: 'None' if x < 1 else '1-3' if 0<x<4 else '4-6' if 3 < x <= 6 else '7-9' if 6<x<10 else '10-12' if 9<x<13 else '13+' if x>12 else 0)

fig = go.Figure()

fig.add_trace(go.Box(
    x=df['sm_use_level'],
    y=df['sim_to_def'],
    boxpoints='all',
    showlegend=False,
))

fig.update_layout(
    xaxis_title='Social Media Platforms Used',
    yaxis_title='Cosine Similarity',
    width=1000,
    height=600,
    showlegend=True, 
    legend =dict(
        orientation="h",
        yanchor="bottom",
        x=.85
        ),
    title='Cosine Similarity of Response to Given Definition by Number of Platforms Used',
    )
fig.update_xaxes(type= 'category', categoryorder = 'array', categoryarray= ['None', '1-3', '4-6', '7-9', '10-12', '13+'])

fig.update_traces(
    # jitter=.7,
    marker=dict(
        color='rgba(50, 100, 255, 1)',
        line=dict(width=1)))

fig.write_image('../../figures/bucketed_similarity_to_def_vs_platforms_used.png')

fig.show()

print(stats.kruskal(
    df[(df['sm_use_level'] == 'None')]['sim_to_def'],
    df[(df['sm_use_level'] == '1-3')]['sim_to_def'],
    df[(df['sm_use_level'] == '4-6')]['sim_to_def'],
    df[(df['sm_use_level'] == '7-9')]['sim_to_def'],
    df[(df['sm_use_level'] == '10-12')]['sim_to_def'],
    df[(df['sm_use_level'] == '13+')]['sim_to_def']
))

KruskalResult(statistic=np.float64(3.862899492383576), pvalue=np.float64(0.5693192592370716))


In [83]:

fig = go.Figure()

fig.add_trace(go.Box(
    x=df[df['total_accuracy']>0.25]['sm_use_level'],
    y=df[df['total_accuracy']>0.25]['total_accuracy'],
    boxpoints='all',
    showlegend=False,
))


fig.update_xaxes(type= 'category', categoryorder = 'array', categoryarray= ['None', '1-3', '4-6', '7-9', '10-12', '13+'])

fig.update_traces(
    # jitter=.7,
    marker=dict(
        color='rgba(50, 100, 255, 1)',
        line=dict(width=1)))

In [86]:
print(stats.kruskal(
    df[(df['sm_use_level'] == 'None') & (df['total_accuracy']>0.4)]['total_accuracy'],
    df[(df['sm_use_level'] == '1-3') & (df['total_accuracy']>0.4)]['total_accuracy'],
    df[(df['sm_use_level'] == '4-6') & (df['total_accuracy']>0.4)]['total_accuracy'],
    df[(df['sm_use_level'] == '7-9') & (df['total_accuracy']>0.4)]['total_accuracy'],
    df[(df['sm_use_level'] == '10-12') & (df['total_accuracy']>0.4)]['total_accuracy'],
    df[(df['sm_use_level'] == '13+') & (df['total_accuracy']>0.4)]['total_accuracy']
))

KruskalResult(statistic=np.float64(29.70467511580446), pvalue=np.float64(1.685973627884288e-05))


### fig 3

In [58]:
fig = go.Figure()
plot_platforms = [p for p in platforms if p + '_use' in df.columns and p + '_accuracy' in df.columns]
accs_used = []
accs_not_used = []
pct_rated_social_media_used = []
pct_rated_social_media_not_used = []
for p in plot_platforms:
    acc_used = df.query(f'{p}_use == 1')[f'{p}_accuracy'].mean()
    accs_used.append(acc_used)
    acc_not_used = df.query(f'{p}_use == 0')[f'{p}_accuracy'].mean()
    accs_not_used.append(acc_not_used)

    n = df.query(f'{p}_use == 1 and {p}_rating == 1').shape[0]/df.query(f'{p}_use == 1').shape[0]
    pct_rated_social_media_used.append(n)
    n = df.query(f'{p}_use == 0 and {p}_rating == 1').shape[0]/df.query(f'{p}_use == 0').shape[0]
    pct_rated_social_media_not_used.append(n)

fig.add_trace(go.Bar(
    y=plot_platforms,
    x=pct_rated_social_media_used,
    orientation='h',
    name='Used by Parent',
    marker_color='rgba(0, 0, 120, 0.8)',
))


fig.add_trace(go.Bar(
    y=plot_platforms,
    x=pct_rated_social_media_not_used,
    orientation='h',
    name='Not Used',
    marker_color='rgba(130, 168, 186, 0.8)',
))
fig.update_layout(
    barmode='group',
    width=800,
    height=800,
    title='Percentage of Parents Who Rated Each Platform as Social Media by Personal Usage',
    xaxis_title='Parents Who Rated as Social Media',
    yaxis_title='Platform',
    xaxis=dict(
        range=[0, 1],
    ),
    legend=dict(
        orientation="h",
        x=0.6,
        y=-0.08,
    ),
)

fig.update_traces(
    marker_line_color='rgb(8,48,107)',
    marker_line_width=1.5,
)

fig.write_image('../../figures/platform_rating_by_use.png')

fig.show()

### fig 4

In [76]:
# fig = go.Figure()
# fig.add_trace(go.Scatter(
#     x = df['total_accuracy'],
#     y = df['sim_to_def'],
#     mode='markers',
#     marker=dict(
#         size=10,
#         color='rgba(0, 0, 255, 0.5)',
#         line=dict(width=2)
#     ),
#     showlegend=False,
# ))

fig = px.strip(
    df,
    x = 'total_accuracy',
    y = 'sim_to_def',  
)

fig.update_traces(
    # jitter=.7,
    marker=dict(
        color='cornflowerblue',
        line=dict(width=2)))

# add regression line
X = df.total_accuracy.values.reshape(-1, 1)

model = sklearn.linear_model.LinearRegression()
model.fit(X, df.sim_to_def)

x_range = np.linspace(X.min(), X.max())
y_range = model.predict(x_range.reshape(-1, 1))

# spearman correlation
s_s, s_p = stats.spearmanr(df['total_accuracy'], df['sim_to_def'])
# pearson correlation
p_s, p_p = stats.pearsonr(df['total_accuracy'], df['sim_to_def'])

fig.add_annotation(
    xref='paper',
    yref='paper',
    x=0.04,
    y=0.1,
    text=f'Spearman correlation: {s_s:.2f} (p-value: {s_p:.2f})<br>Pearson correlation: {p_s:.2f} (p-value: {p_p:.2f})',
    showarrow=False,
    font=dict(size=12, color='black'),
    bgcolor='rgba(255, 255, 255, 0.8)',
)

fig.add_trace(go.Scatter(
    x=x_range,
    y=y_range,
    name='Regression Fit',
    marker=dict(
        color='rgb(116, 77, 145)',
    ),
    

),
)

fig.update_layout(
    xaxis_title='Accuracy of Social Media Identification',
    yaxis_title='Similarity',
    width=1000,
    height=600,
    legend =dict(
        orientation="h",
        yanchor="bottom",
        x=.85
        ),
    title = 'Accuracy of Social Media Identification vs. Cosine Similarity of Text Response to Definition',
    )

fig.write_image('../../figures/accuracy_similarity.png')

fig.show()

In [80]:
fig = px.strip(
    df,
    y = 'total_accuracy',
    x = 'sim_to_def',  
)

fig.update_traces(
    # jitter=.7,
    marker=dict(
        color='cornflowerblue',
        line=dict(width=2)))

# add regression line
X = df.sim_to_def.values.reshape(-1, 1)

model = sklearn.linear_model.LinearRegression()
model.fit(X, df.total_accuracy)

x_range = np.linspace(X.min(), X.max())
y_range = model.predict(x_range.reshape(-1, 1))

# spearman correlation
s_s, s_p = stats.spearmanr(df['sim_to_def'], df['total_accuracy'])
# pearson correlation
p_s, p_p = stats.pearsonr(df['sim_to_def'], df['total_accuracy'])

fig.add_annotation(
    xref='paper',
    yref='paper',
    x=0.04,
    y=0.1,
    text=f'Spearman correlation: {s_s:.2f} (p-value: {s_p:.2f})<br>Pearson correlation: {p_s:.2f} (p-value: {p_p:.2f})',
    showarrow=False,
    font=dict(size=12, color='black'),
    bgcolor='rgba(255, 255, 255, 0.8)',
)

fig.add_trace(go.Scatter(
    x=x_range,
    y=y_range,
    name='Regression Fit',
    marker=dict(
        color='rgb(116, 77, 145)',
    ),
    

),
)

# fig.update_layout(
#     xaxis_title='Accuracy of Social Media Identification',
#     yaxis_title='Similarity',
#     width=1000,
#     height=600,
#     legend =dict(
#         orientation="h",
#         yanchor="bottom",
#         x=.85
#         ),
#     title = 'Accuracy of Social Media Identification vs. Cosine Similarity of Text Response to Definition',
#     )

fig.write_image('../../figures/accuracy_similarity.png')

fig.show()

## Dimensionality Reduction

### t-SNE

In [None]:

tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)
vis_dims = tsne.fit_transform(X)
tsne_x = [x for x, y in vis_dims]
tsne_y = [y for x,y in vis_dims]

# colors = [str(x) for x in df['survey_num'].to_list()]
# color_map = {1: 'coral', 2: 'mediumseagreen', 3: 'cornflowerblue', 4: 'orchid'}

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=tsne_x,
    y=tsne_y,
    mode='markers',
    hoverinfo='all',
    text=df['sm_def'].to_list(),
    marker=dict(
        size=5,
        color="cornflowerblue",
        line=dict(width=2),

    ),
))

fig.update_layout(
    title='t-SNE of Embeddings of Parents\' Social Media Definitions',
    width=900,
    height=850,
)


fig.write_html('../../figures/q1_tsne.html')

fig.write_image('../../figures/q1_tsne.png')

fig.show()


### PCA

In [60]:
np.array(q1_embeddings).shape

(1002, 1536)

In [61]:
X = np.array(q1_embeddings)
pca = PCA(n_components=3)
components = pca.fit_transform(X)
exp_var = pca.explained_variance_ratio_.sum()

df[['c1', 'c2', 'c3']] = components

In [62]:
df['scatter_color'] = np.where((df['c2']>.4), 'khaki', np.where(df['c3']<-0.4, 'lightgreen',np.where(df['c1']>0, 'cornflowerblue','mediumblue')))

fig = go.Figure()
fig.add_trace(go.Scatter3d(
    x=components[:, 0],
    y=components[:, 1],
    z=components[:, 2],
    mode='markers',
    marker_color = df['scatter_color'],
    showlegend=False,
))

fig.update_traces(
    marker=dict(
        size=3,
        line=dict(width=1),
        opacity=0.85))

# fig.add_trace(go.Scatter3d(
#     x = [-.05], y = [.55], z = [-.05],
#     text = 'test',
#     mode = 'text',
#     textfont=dict(
#         size=20,
#         color='goldenrod',
#         weight='bold'),
# ))

# fig.add_trace(go.Scatter3d(
#     x = [-.2], y = [0], z = [-.1],
#     text = f'formal <br>definition',
#     mode = 'text',
#     textfont=dict(
#         size=20,
#         color='midnightblue',
#         weight='bold'),
# ))

fig.add_trace(go.Scatter3d(
    x=[None], y=[None], z=[None],
    name = 'Formal definition',
    mode='markers',
    marker=dict(size=10, color="midnightblue",)
))

fig.add_trace(go.Scatter3d(
    x=[None], y=[None], z=[None],
    name = 'Informal definition',
    mode='markers',
    marker=dict(size=10, color="cornflowerblue",)
))

fig.add_trace(go.Scatter3d(
    x=[None], y=[None], z=[None],
    name = 'Pros and cons',
    mode='markers',
    marker=dict(size=10, color="khaki",)
))

fig.add_trace(go.Scatter3d(
    x=[None], y=[None], z=[None],
    name = 'List of platforms',
    mode='markers',
    marker=dict(size=10, color="lightgreen",)
))

fig.update_layout(
    title="3D PCA of Embeddings of Parents' Social Media Definitions",
    scene=dict(
        xaxis_title='PC1',
        yaxis_title='PC2',
        zaxis_title='PC3',
        aspectmode='cube'
    ),
    width=800,
    height=800,
    legend=dict(
        orientation='h',
        y=1.02,
        entrywidth=110,
    ),
)

fig.add_annotation(
    xref='paper',
    yref='paper',
    showarrow=False,
    x=0.5,
    y=1.05,
    text='Visually identified clusters',
    font=dict(size=14, color='black'),
)
fig.show()

fig.write_html('../../figures/q1_PCA.html')

fig.write_image('../../figures/q1_PCA.png')


In [63]:
df.query('scatter_color =="darkblue"')['sm_def'].values

array([], dtype=object)

In [64]:
df.query('scatter_color =="cornflowerblue"')['sm_def'].values

array(['A place for people to share and communicate everyday things',
       'Online access to friends and sharing with others',
       'A platform where one can express their thoughts and about your life with friends and with people you do not know.',
       'Any number of apps or sites that allow people to connect online. Either through personal accounts (i.e. Facebook/Instagram) or through videos and gaming platforms (i.e. TikTok).',
       'A platform people use to connect with friends and popular people they like.',
       'Just using online social networks like X or Reddit.',
       'A PLACE WHERE YOU MEET NEW PERSON AND EDUCATIVE',
       'online abilities where you can interact with other people or react to videos of other people.',
       "Any app that you can post on or view other people's posts.",
       'A place to exchange of information, ideas, and personal experiences.',
       'Programs where you can communicate with others. You can also watch videos and learn new subje

In [65]:
df.query('c2>.5 and c3>-0.02')['sm_def'].values

array(['used for good purposes it can be very useful, but for children of a certain age it is completely unnecessary',
       'Social media is a great tool and it also can be very destructive without proper supervision and controls. I like social media for the good and bad that it provides because I try to use the good and bad for teaching moments with my child.',
       'its a great influence to our children both positively and negatively',
       'for one it can be bad for the kid if she is on to long. two bad things can be learned. it doesnt teach the kid nothing.',
       'It can be a great learning tool. It can also cause a lot of problems.',
       "Social media is has both good and bad sides. It can be beneficial and educational. And it can also have a negative impact on a child's self esteem as well as giving them a false sense of reality.",
       "It can be a good thing and bad thing. There are some educational and informative things on there. But there are also things on the

In [66]:
df.query('c3<-0.4')['sm_def'].values

array(['Facebook, twitter, tiktok', 'apps like instagram',
       'Facebook, Instagram, X', 'Facebook, instagram, TikTok, YouTube',
       'Apps like tick tock or facebook. watching videos on youtube or other platforms like youtube.',
       'Anything like Facebook or Instagram. Snapchat or the like.',
       'Facebook, TikTok, anything app wise on a smart device.',
       'Social media is using the different apps available, such as tictok, facebook, youtube, instagram.',
       'Apps like Youtube, instagram, Tiktok.',
       'Instagram, Facebook, Tiktok, Youtube',
       'Apps like snap chat and instagram',
       'Twitter, tiktok, facebook, pinterest, instagram, snapchat',
       'YouTube, tik tok instagram',
       'ALLOWING ON APPS SUCH AS FACEBOOK, SNAPCHAT AND INSTAGRAM.',
       'Any type of social website/app such as facebook, discord, snapchat, X (Twitter), instagram,',
       'Facebook, TikTok, all the dating apps, twitter (X), Intagram, WhatsApp, etc.',
       'Instagram, Fa

In [67]:
pca = PCA()
pca.fit(X)

explained_variance = pca.explained_variance_ratio_.sum()
exp_var_cumul = np.cumsum(pca.explained_variance_ratio_)
print(f"Explained variance: {explained_variance:.2f}")

pca.explained_variance_ratio_.shape

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=list(range(1, len(pca.explained_variance_ratio_) + 1)),
    y=exp_var_cumul,
    mode='lines+markers',
    name='Explained Variance Ratio',
    marker=dict(size=4, color='blue'),
    line=dict(width=2, color='lightblue'),
    fill='tozeroy',
))

fig.update_xaxes(range=[0,500])

Explained variance: 1.00
