In [105]:
import pandas as pd
from sklearn.decomposition import PCA
import json
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
from scipy import spatial
from scipy import stats
from sklearn.manifold import TSNE
from embeddings import get_reduced_embeddings


In [3]:
def get_similarity(embedding1, embedding2):
    """Return the cosine similarity of two embedding vectors.

    The value is one minus SciPy's cosine distance between the two inputs.
    """
    cosine_distance = spatial.distance.cosine(embedding1, embedding2)
    return 1 - cosine_distance

In [4]:
# Load the survey responses and keep only the rows that have a q2 embedding.
raw_responses = pd.read_csv('../../data/text_response_embeddings.csv')
df = raw_responses.query('q2_embedding.notnull()').reset_index(drop=True)

# Each stored embedding is decoded with json.loads, then written back to the column.
q2_embeddings = [json.loads(str(serialized)) for serialized in df['q2_embedding'].tolist()]
df['q2_embedding'] = q2_embeddings

# Array form of the decoded embeddings (used by the PCA cells).
X = np.array(q2_embeddings)

### similarity

In [None]:
def sim_comparison(row, df, dim='all', exclude_self=False):
    """Compare one response's embedding with its own survey group and with the other groups.

    Parameters
    ----------
    row : pandas.Series
        One row of ``df``. Must provide ``survey_num`` and the embedding column
        selected by ``dim``.
    df : pandas.DataFrame
        All responses to compare against. Must provide the same two columns.
    dim : 'all' or int, default 'all'
        ``'all'`` uses the ``q2_embedding`` column. An integer ``n`` uses the
        ``reduced_embedding_<n>`` column (e.g. ``100`` -> ``reduced_embedding_100``).
    exclude_self : bool, default False
        When True, the row labelled ``row.name`` is left out of the within-group
        comparison. With the default (False) the response is also compared with
        itself, which adds one similarity of 1.0 to the within-group average.
        Assumes ``df`` has unique index labels.

    Returns
    -------
    tuple
        ``(others_mean, group_mean)``: mean cosine similarity to responses from
        other survey groups, and to responses from the same survey group. A mean
        is NaN when the corresponding set of responses is empty.

    Raises
    ------
    ValueError
        If ``dim`` is neither ``'all'`` nor an integer.
    """
    if isinstance(dim, str) and dim == 'all':
        column = 'q2_embedding'
    elif isinstance(dim, (int, np.integer)) and not isinstance(dim, bool):
        column = f'reduced_embedding_{dim}'
    else:
        # Previously an unsupported value fell through both branches and failed
        # later with an UnboundLocalError on `embedding`.
        raise ValueError(f"dim must be 'all' or an integer number of dimensions, got {dim!r}")

    group = row['survey_num']
    embedding = row[column]

    # One boolean mask serves both selections, replacing the mixed f-string / @group queries.
    in_group = df['survey_num'] == group
    group_rows = df.loc[in_group, column]
    if exclude_self:
        group_rows = group_rows.drop(index=row.name, errors='ignore')

    other_embeddings = df.loc[~in_group, column].to_list()
    group_embeddings = group_rows.to_list()

    others_sim_scores = [get_similarity(embedding, x) for x in other_embeddings]
    others_mean = np.mean(others_sim_scores)
    group_sim_scores = [get_similarity(embedding, x) for x in group_embeddings]
    group_mean = np.mean(group_sim_scores)
    return others_mean, group_mean


In [6]:
# For every response: mean similarity to other groups and to its own group (full embeddings).
similarity_pairs = df.apply(lambda row: sim_comparison(row, df, dim='all'), axis=1)
df['others_mean'], df['group_mean'] = zip(*similarity_pairs)


## t-SNE - explore embeddings space

In [162]:
# Project the q2 embeddings to two dimensions with t-SNE and plot them by survey group.
matrix = np.array(q2_embeddings)

tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)
vis_dims = tsne.fit_transform(matrix)
# Kept as plain Python lists so other cells can keep treating them as lists.
tsne_x = list(vis_dims[:, 0])
tsne_y = list(vis_dims[:, 1])

colors = [str(x) for x in df['survey_num'].to_list()]
color_map = {1: 'coral', 2: 'mediumseagreen', 3: 'cornflowerblue', 4: 'orchid'}
# Legend label shown for each survey group's centroid marker.
group_labels = {1: 'Ages 7-10', 2: 'Ages 11-14', 3: 'Ages 15-18', 4: 'Ages 19-22'}

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=tsne_x,
    y=tsne_y,
    mode='markers',
    hoverinfo='all',
    text=df['q2_24'].to_list(),
    marker=dict(
        size=5,
        color=[color_map[int(x)] for x in colors],
        line_color='black',
        line_width=0.5,
    ),
    showlegend=False,
))

fig.update_layout(
    title='t-SNE of Embeddings of Factors of Appropriateness',
    width=900,
    height=850,
)

# One large marker per survey group at the mean t-SNE position of its responses.
# A positional mask is used instead of index labels, so the lookup stays correct
# even if `df` does not carry a default 0..n-1 index.
survey_groups = df['survey_num'].to_numpy()
for survey_num, label in group_labels.items():
    in_group = survey_groups == survey_num
    fig.add_trace(go.Scatter(
        x=[np.mean(vis_dims[in_group, 0])],
        y=[np.mean(vis_dims[in_group, 1])],
        name=label,
        mode='markers',
        marker=dict(size=12, color=color_map[survey_num]),
    ))

fig.write_html('../../figures/q2_tsne.html')

fig.write_image('../../figures/q2_tsne.png')

fig.show()


#### Remove "none" responses (points with t-SNE x < -40)

In [164]:
# Positions of the points with t-SNE x below -40 (the "none" responses being removed).
# enumerate() replaces tsne_x.index(x), which rescanned the list for every point and
# returned the first match only, so duplicate x values mapped to the wrong row.
indexes = [position for position, x in enumerate(tsne_x) if x < -40]

# Translate positions to index labels before dropping.
subset = df.drop(df.index[indexes]).reset_index(drop=True)
subset_q2_embeddings = subset['q2_embedding'].tolist()

subset_matrix = np.array(subset_q2_embeddings)

# NOTE(review): the original created `subset_tsne` with default settings but then fitted
# the `tsne` object from the previous cell, so `subset_tsne` was never used. It is now
# configured like that estimator and actually fitted, which keeps the layout settings of
# the existing figure and removes the dependency on `tsne`. Confirm these are the
# intended hyperparameters for the subset.
subset_tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)
subset_vis_dims = subset_tsne.fit_transform(subset_matrix)
subset_tsne_x = list(subset_vis_dims[:, 0])
subset_tsne_y = list(subset_vis_dims[:, 1])

colors = [str(x) for x in subset['survey_num'].to_list()]
color_map = {1: 'coral', 2: 'mediumseagreen', 3: 'cornflowerblue', 4: 'orchid'}
# Legend label shown for each survey group's centroid marker.
group_labels = {1: 'Ages 7-10', 2: 'Ages 11-14', 3: 'Ages 15-18', 4: 'Ages 19-22'}

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=subset_tsne_x,
    y=subset_tsne_y,
    mode='markers',
    hoverinfo='all',
    text=subset['q2_24'].to_list(),
    marker=dict(
        size=5,
        color=[color_map[int(x)] for x in colors],
        line_width=0.5,
        line_color='black',
    ),
    showlegend=False,
))

fig.update_layout(
    title='t-SNE of Embeddings of Factors of Appropriateness',
    width=900,
    height=850,
)

# One large marker per survey group at the mean t-SNE position of its responses.
survey_groups = subset['survey_num'].to_numpy()
for survey_num, label in group_labels.items():
    in_group = survey_groups == survey_num
    fig.add_trace(go.Scatter(
        x=[np.mean(subset_vis_dims[in_group, 0])],
        y=[np.mean(subset_vis_dims[in_group, 1])],
        name=label,
        mode='markers',
        marker=dict(size=12, color=color_map[survey_num]),
    ))

fig.write_html('../../figures/q2_tsne_subset.html')

fig.write_image('../../figures/q2_tsne_subset.png')

fig.show()



## Similarity within VS between groups

In [102]:
# Histograms of each response's mean similarity to other groups and to its own group.
between_hist = go.Histogram(
    x=df['others_mean'].to_list(),
    name='Between Groups',
    marker_color='darkolivegreen',
)
within_hist = go.Histogram(
    x=df['group_mean'].to_list(),
    name='Within Groups',
    marker_color='darkseagreen',
    opacity=0.75,
)

fig = go.Figure()
for histogram in (between_hist, within_hist):
    fig.add_trace(histogram)

# Same bar outline for both histograms.
fig.update_traces(marker_line_color='slategrey', marker_line_width=1.5)

# Horizontal legend anchored at the top-right of the figure.
legend_settings = dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1,
)
fig.update_layout(
    title='Distribution of Mean Cosine Similarity Scores',
    xaxis_title='Mean Similarity Score',
    yaxis_title='Count',
    barmode='group',
    width=900,
    height=600,
    legend=legend_settings,
)

fig.write_html('../../figures/q2_distribution.html')

fig.write_image('../../figures/q2_distribution.png')

fig.show()

In [103]:
# Box plots of the same two distributions (values are passed on the x axis).
box_specs = [
    dict(x=df['others_mean'].to_list(), name='Between Groups', marker_color='darkolivegreen'),
    dict(x=df['group_mean'].to_list(), name='Within Groups', marker_color='darkseagreen', opacity=0.75),
]

fig = go.Figure()
for spec in box_specs:
    fig.add_trace(go.Box(**spec))

fig.update_layout(
    title='Distribution of Mean Cosine Similarity Scores',
    xaxis_title='Mean Similarity Score',
    width=900,
    height=600,
    showlegend=False,
)

fig.write_html('../../figures/q2_box.html')

fig.write_image('../../figures/q2_box.png')

fig.show()

In [120]:
# Per-response gap between within-group and between-group mean similarity.
df['diff'] = df['group_mean'] - df['others_mean']
# A bare expression in the middle of a cell is never displayed, so print the mean explicitly.
print(f"Mean within-minus-between difference: {df['diff'].mean():.4f}")

# Wilcoxon signed-rank test on the differences; the result is the cell's displayed output.
stats.wilcoxon(df['diff'].to_list())

WilcoxonResult(statistic=np.float64(104479.0), pvalue=np.float64(5.910082506750029e-48))

In [None]:
# Recalculate the means for the subset with the "none" responses removed.
subset_pairs = subset.apply(lambda row: sim_comparison(row, subset, dim='all'), axis=1)
subset['others_mean'], subset['group_mean'] = zip(*subset_pairs)

subset['diff'] = subset['group_mean'] - subset['others_mean']
# A bare expression in the middle of a cell is never displayed, so print the mean explicitly.
print(f"Mean within-minus-between difference (subset): {subset['diff'].mean():.4f}")

# Wilcoxon signed-rank test on the differences; the result is the cell's displayed output.
stats.wilcoxon(subset['diff'].to_list())

WilcoxonResult(statistic=np.float64(80588.0), pvalue=np.float64(4.098071698815577e-40))

## reduced dimension embeddings

In [None]:
# Commented-out code that builds the reduced embeddings with get_reduced_embeddings and
# writes them to ../../data/reduced_embeddings.csv. Presumably a one-off generation step
# kept for reference — confirm before deleting.
# red_embeddings_10 = [get_reduced_embeddings(x, dimensions=10) for x in df['q2_24'].to_list()]
# red_embeddings_100 = [get_reduced_embeddings(x, dimensions=100) for x in df['q2_24'].to_list()]


# embeddings_df = pd.DataFrame({'record_id':df['record_id'].to_list(), 'reduced_embedding_100':red_embeddings_100, 'reduced_embedding_10':red_embeddings_10})
# embeddings_df.to_csv('../../data/reduced_embeddings.csv', index=False)
# red_embeddings_df = pd.read_csv('../../data/reduced_embeddings.csv')

# df = df.merge(embeddings_df, how='left', on='record_id')


In [None]:
# Load the stored reduced-dimension embeddings and attach them to the responses by record_id.
red_embeddings_df = pd.read_csv('../../data/reduced_embeddings.csv')

# Merge only the columns `df` does not have yet. Re-running the original cell merged the
# same columns again, producing suffixed duplicates (`..._x` / `..._y`) and breaking later
# lookups of `reduced_embedding_100`; with this guard a second run is a no-op.
new_columns = [col for col in red_embeddings_df.columns if col not in df.columns]
if new_columns:
    df = df.merge(red_embeddings_df[['record_id'] + new_columns], how='left', on='record_id')

In [None]:
# Decode each stored `reduced_embedding_100` value with json.loads and write it back.
red_embeddings = []
for serialized in df['reduced_embedding_100'].tolist():
    red_embeddings.append(json.loads(str(serialized)))
df['reduced_embedding_100'] = red_embeddings

# Per-response (between-group, within-group) mean similarity on the reduced embeddings.
reduced_pairs = df.apply(lambda row: sim_comparison(row, df, dim=100), axis=1)
df['reduced_others_mean'], df['reduced_group_mean'] = zip(*reduced_pairs)

In [None]:
# Grouped box plots: between- vs within-group similarity for full and reduced embeddings.
fig = go.Figure()

# (x-axis category, dataframe column, colour), in the order the boxes are added.
box_specs = [
    ('High Dimensional', 'others_mean', 'lightblue'),
    ('High Dimensional', 'group_mean', 'lightgreen'),
    ('Reduced Dimensions', 'reduced_others_mean', 'lightblue'),
    ('Reduced Dimensions', 'reduced_group_mean', 'lightgreen'),
]
for category, column, colour in box_specs:
    values = df[column].to_list()
    fig.add_trace(go.Box(
        x=[category] * len(values),
        y=values,
        marker_color=colour,
        showlegend=False,
    ))

# Data-less traces that only contribute the two legend entries.
legend_entries = [('Within Group', 'lightgreen'), ('Between Groups', 'lightblue')]
for legend_name, colour in legend_entries:
    fig.add_trace(go.Box(
        x=None,
        y=None,
        name=legend_name,
        marker_color=colour,
        showlegend=True,
    ))

fig.update_layout(boxmode='group', height=600, width=600)

fig.show()

In [121]:

# t-SNE projection of the reduced embeddings, coloured by survey group.
tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)
vis_dims = tsne.fit_transform(np.array(red_embeddings))
x = [first for first, second in vis_dims]
y = [second for first, second in vis_dims]

colors = [str(survey) for survey in df['survey_num'].to_list()]
color_map = {1: 'coral', 2: 'mediumseagreen', 3: 'cornflowerblue', 4: 'orchid'}
marker_colors = [color_map[int(label)] for label in colors]

fig = go.Figure()
points = go.Scatter(
    x=x,
    y=y,
    mode='markers',
    hoverinfo='text',
    text=df['q2_24'].to_list(),
    marker=dict(size=5, color=marker_colors),
)
# add_trace returns the figure, so ending the cell with it renders the plot.
fig.add_trace(points)

In [100]:

# `red_embeddings_10` was only ever assigned in commented-out code, so on a fresh kernel
# this cell failed with a NameError. Rebuild it from the dataframe instead.
# NOTE(review): assumes the merged reduced-embeddings CSV provides a JSON-encoded
# `reduced_embedding_10` column next to `reduced_embedding_100` — confirm.
red_embeddings_10 = [json.loads(str(x)) for x in df['reduced_embedding_10'].tolist()]

# t-SNE projection of the 10-dimensional embeddings, coloured by survey group.
tsne = TSNE(n_components=2, perplexity=15, random_state=42, init='random', learning_rate=200)
vis_dims = tsne.fit_transform(np.array(red_embeddings_10))
x = [x for x, y in vis_dims]
y = [y for x,y in vis_dims]

colors = [str(x) for x in df['survey_num'].to_list()]
color_map = {1: 'coral', 2: 'mediumseagreen', 3: 'cornflowerblue', 4: 'orchid'}
fig = go.Figure()
# add_trace returns the figure, so ending the cell with it renders the plot.
fig.add_trace(go.Scatter(
    x=x,
    y=y,
    mode='markers',
    hoverinfo='text',
    text=df['q2_24'].to_list(),
    marker=dict(
        size=5,
        color=[color_map[int(x)] for x in colors],

    ),
))

### scree

In [9]:
# Display the embedding matrix built from q2_embeddings.
X

array([[ 0.03291608, -0.02408563, -0.06436042, ...,  0.03526521,
        -0.00539521, -0.00014527],
       [ 0.04943096,  0.02293069,  0.01149121, ...,  0.0063276 ,
        -0.01283627,  0.03856701],
       [-0.00372656, -0.02115919, -0.026524  , ..., -0.02136322,
        -0.006649  , -0.00421564],
       ...,
       [ 0.02369866,  0.04460622, -0.03541355, ...,  0.02341699,
        -0.03871677,  0.02408275],
       [-0.01343912,  0.02851431, -0.03159947, ...,  0.02837408,
        -0.04499963,  0.02592777],
       [ 0.03609044, -0.00096973, -0.02461942, ..., -0.01139663,
        -0.02004724, -0.00445381]], shape=(956, 1536))

In [10]:
# Fit a PCA with default settings (no component limit given) on the embedding matrix.
pca = PCA()
components = pca.fit_transform(X)
# pca.fit(X)

variance_ratios = pca.explained_variance_ratio_
# Running total of explained variance, used for the scree plot.
exp_var_cumul = np.cumsum(variance_ratios)
explained_variance = variance_ratios.sum()
print(f"Explained variance: {explained_variance:.2f}")

# Number of fitted components (displayed as the cell output).
variance_ratios.shape

Explained variance: 1.00


(956,)

In [11]:
# Cumulative explained variance against the number of principal components.
component_numbers = list(range(1, len(pca.explained_variance_ratio_) + 1))
cumulative_trace = go.Scatter(
    x=component_numbers,
    y=exp_var_cumul,
    mode='lines+markers',
    name='Explained Variance Ratio',
    marker=dict(size=4, color='blue'),
    line=dict(width=2, color='lightblue'),
    fill='tozeroy',
)

fig = go.Figure()
fig.add_trace(cumulative_trace)

# Limit the x axis to 0-500; the call returns the figure, which renders it as the cell output.
fig.update_xaxes(range=[0,500])

### scatter matrix

In [62]:
# Treat the survey group as categorical so the scatter matrix uses discrete colours.
df['survey_num'] = df['survey_num'].astype('category')
pca = PCA(n_components=10)
components = pca.fit_transform(X)
# Axis labels of the form "PC 1 (12.3%)", keyed by the component's column position.
labels = {
    str(i): f"PC {i+1} ({var:.1f}%)"
    for i, var in enumerate(pca.explained_variance_ratio_ * 100)
}

# `labels` was previously built but never handed to the plot, so the axes showed bare
# column numbers instead of the component names with their explained variance.
fig = px.scatter_matrix(
    components,
    labels=labels,
    dimensions=range(10),
    color=df["survey_num"]
)
fig.update_traces(diagonal_visible=False)
fig.show()

### 3 dimensional

In [None]:
# Reduce the embeddings to three principal components.
pca_3 = PCA(n_components=3)
components_3 = pca_3.fit_transform(X)
exp_var_3 = pca_3.explained_variance_ratio_.sum()

# Store each component as its own dataframe column so it can be referenced by name.
for position, column in enumerate(['c1', 'c2', 'c3']):
    df[column] = components_3[:, position]

In [None]:
# 3-D scatter of the three principal components, coloured by survey group.
pc1, pc2, pc3 = components_3[:, 0], components_3[:, 1], components_3[:, 2]

fig = go.Figure()
fig.add_trace(go.Scatter3d(
    x=pc1,
    y=pc2,
    z=pc3,
    mode='markers',
    marker_color=df['survey_num'],
    showlegend=False,
))

# Small outlined, slightly transparent markers.
point_style = dict(size=3, line=dict(width=1), opacity=0.85)
fig.update_traces(marker=point_style)

scene_settings = dict(
    xaxis_title='PC1',
    yaxis_title='PC2',
    zaxis_title='PC3',
    aspectmode='cube'
)
fig.update_layout(
    title="3D PCA of Embeddings of Factors Considered When Determining if Content is Appropriate",
    scene=scene_settings,
    width=800,
    height=800,
    legend=dict(orientation='h', y=1.02),
)

fig.show()

In [None]:
# 3-D scatter of the three principal components, coloured by the hand-assigned cluster colour.
# The `cluster` column is created by a separate cell; fail with a clear message rather
# than a bare KeyError when that cell has not been run yet.
assert 'cluster' in df.columns, (
    "df['cluster'] is missing - run the cell that assigns df['cluster'] before this one"
)

fig = go.Figure()
fig.add_trace(go.Scatter3d(
    x=components_3[:, 0],
    y=components_3[:, 1],
    z=components_3[:, 2],
    mode='markers',
    marker_color = df['cluster'],
    showlegend=False,
))

fig.update_traces(
    marker=dict(
        size=3,
        line=dict(width=1),
        opacity=0.85))

# Legend label -> colour. Each entry is added as a data-less trace, after the
# update_traces call above so the legend markers keep their own size of 10.
cluster_legend = {
    'Individual maturity': 'mediumpurple',
    'Content - unspecified': 'lightskyblue',
    'Sex or violence': 'lightcoral',
    'None or agreement with given factors': 'lightgreen',
    'Other responses': 'lightgrey',
}
for legend_name, legend_color in cluster_legend.items():
    fig.add_trace(go.Scatter3d(
        x=[None], y=[None], z=[None],
        name=legend_name,
        mode='markers',
        marker=dict(size=10, color=legend_color),
    ))

fig.update_layout(
    title="3D PCA of Embeddings of Factors Considered When Determining if Content is Appropriate",
    scene=dict(
        xaxis_title='PC1',
        yaxis_title='PC2',
        zaxis_title='PC3',
        aspectmode='cube'
    ),
    width=800,
    height=800,
    legend=dict(
        orientation='h',
        y=1.02,
    )
)

fig.show()

fig.write_html('../../figures/q2_PCA.html')

fig.write_image('../../figures/q2_PCA.png')

In [90]:
# Assign each response a colour from the region of PCA space it falls in.
# Rules are checked in order and the first match wins; unmatched rows are grey.
cluster_rules = [
    ((df['c2'] > 0.3) & (df['c1'] > 0), 'mediumpurple'),
    ((df['c2'] < -0.2) & (df['c3'] < 0), 'lightskyblue'),
    ((df['c2'] < -0.2) & (df['c3'] > 0), 'lightcoral'),
    ((df['c3'] < 0.25) & (df['c1'] < -0.3) & (df['c2'] > 0), 'lightgreen'),
]
df['cluster'] = np.select(
    [condition for condition, _ in cluster_rules],
    [colour for _, colour in cluster_rules],
    default='lightgrey',
)

In [None]:
# purple - depends on individual child
df.loc[(df['c2'] > 0.3) & (df['c1'] > 0), 'q2_24'].values

array(["Depending on the maturity of the child's emotional development",
       'If it is something my child will benefit from mentally.',
       'Discussions with other parents with children around the same age, monitoring things myself to decide if I feel they are appropriate for him',
       'the childs maturity level in general.some children are more mature than others , so knowing where your child is makes it easier to know the level which is appropriate',
       'regardless of age, is it something my child.is ready to handle.',
       'I want to see it myself and guesstimate how I think it will affect my child.  I know him better than those on-line.',
       "My own child's maturity",
       "I think of my son's maturity level and what he has seen and experienced.",
       "I CAN'T THINK OF ANY BECAUSE MY SON IS 18 YEARS OLD NOW AND IS LEGALLY AN ADULT. MY DAYS OF HOVERING ARE OVERWITH AND I HAVE TO HAVE FAITH IN THE JOB I DID IN RAISING HIM.",
       'Depending on how mature i f

In [None]:
# blue - content, unspecified
df.loc[(df['c2'] < -0.2) & (df['c3'] < 0), 'q2_24'].values

array(['The content creator',
       'Other factors include the quality of the content, such as whether they are curated reviewed, and approved, and who developed the contents.',
       "The source of the content, it's authenticity.",
       'The reputation of the content creator',
       'using to set examples of good and bad content',
       'Content and language',
       'what is the content motive is it educational or just for entertainment',
       'The actual content. Whether it is positive or toxic.',
       'esplicit contents',
       'the morals and values of the content creators.',
       'The themes in the content.', 'The content to promote culture',
       "The content's entertainment value", 'The creator of the content',
       'is the content beneficial or just for fun and does the content encourage creativity and learning or just scrolling',
       'The style of illustration or content matter',
       'The actors in the content also matters',
       'subject of said cont

In [None]:
# red - sex or violence
df.loc[(df['c2'] < -0.2) & (df['c3'] > 0), 'q2_24'].values

array(['SEXUAL CONTENT AND USE OF WORDS',
       'language,substance use,sexual content',
       'If the content is sexual in nature or uses foul language.',
       'The level of violence, sexual scenes and frightening scenes.',
       'if the content has features of a "rated R" rating like language, no sex, no drugs, etc.',
       'profanity and themes of the content',
       'Language, nudity, and graphic violence.',
       'The amount of violence potentially portrayed in the content.',
       'The Graphic images/video content;  Whether or not the video/graphics contain weapons, knives or tools or toys used in an aggressive way.  Also, any content that depicts one person being mean or bullying towards another, especially one child towards another depicted in video or graphics. This includes mistreating animals, insects, or even plants.',
       'If violence or sexual situations are portrayed as well as discriminatory language and behavior.',
       'Mostly just the subject matter, th

In [71]:
# green - none, or agreement with the given factors
df.loc[(df['c3'] < 0.25) & (df['c1'] < -0.3) & (df['c2'] > 0), 'q2_24'].values

array(["can't think of any right now", "I can't think of other factors.",
       'No factors', 'NO', 'Known sesame streets', 'nothing',
       "can't think of anything else", 'the above is a good list.',
       'interest', 'Nothing additional - this list covers it.', 'none',
       'none', 'none', 'none', 'Not sure', 'N/a',
       'Those about sum it up above.', "That's it", 'No other factors.',
       "I don't consider any other factors.",
       'these pretty much cover it.', 'N/a', 'none', 'not applicable',
       'Everthing already listed',
       'These pretty much are the most important ones that come to mind.',
       'not sure', 'no other considerations',
       "I can't think of other factors other than the above",
       'No other factors. The ones already mentioned will suffice.',
       "I can't think of any right now.", "Couldn't come up with any.",
       "These are perfect. can't think of any off hand", 'none', 'na',
       'happiness',
       "Can't think of any other f