In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from qwikidata.linked_data_interface import get_entity_dict_from_api
from qwikidata.entity import WikidataItem
import itertools
from collections import Counter
from tqdm.notebook import tqdm

from datetime import datetime

 
pd.options.plotting.backend = "plotly"

In [4]:
df = pd.read_json('data\\clean_speakers_5.jsonl')
print(df.shape)
df.head(2)

(326437, 5)


Unnamed: 0,n_quotes,age,nationality,gender,occupation
Q270316,21060,74,Q30,Q6581072,Q82955
Q1253,94704,77,Q884,Q6581097,Q82955


In [5]:
df_male = df[df['gender']=='Q6581097']
df_female = df[df['gender']=='Q6581072']

In [6]:
ctr = Counter(df_male['occupation'])
relevant_occupations_male = pd.DataFrame.from_dict(ctr, orient='index').reset_index().rename(columns={'index': 'qid', 0: 'count'}).sort_values('count', ascending=False)[0:10]
relevant_occupations_male['meaning'] = relevant_occupations_male['qid'].apply(lambda s: get_entity_dict_from_api(s)['labels']['en']['value'])
relevant_occupations_male['count'] = relevant_occupations_male['count'].divide(len(df_male))
relevant_occupations_male = relevant_occupations_male.sort_values('count', ascending=True)
relevant_occupations_male['meaning'] = relevant_occupations_male['meaning'].map(lambda x: x.capitalize())
relevant_occupations_male.head(10)

Unnamed: 0,qid,count,meaning
53,Q36180,0.021606,Writer
23,Q12299841,0.022898,Cricketer
11,Q11774891,0.024293,Ice hockey player
13,Q1930187,0.024721,Journalist
8,Q3665646,0.024729,Basketball player
22,Q10871364,0.026152,Baseball player
5,Q19204627,0.042082,American football player
36,Q33999,0.046136,Actor
0,Q82955,0.104647,Politician
4,Q937857,0.109527,Association football player


In [10]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import numpy as np

y_saving = (relevant_occupations_male['count']*100).tolist()

x = relevant_occupations_male['meaning'].tolist()


# Creating two subplots
fig = go.Figure()

fig.add_trace(go.Bar(
    x=y_saving,
    y=x,
    marker=dict(
        color='rgba(138,43,226, 0.8)',
        line=dict(
        color='rgba(138,43,226, 1.0)',
        width=1),
    ),
    name='Top 10 male occupations in Quotebank\'s speakers',
    orientation='h',
))

fig.update_layout(
    autosize=False,
    width=1000,
    height=500,
    title='Top 10 occupations for males',
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=True,
        domain=[0, 0.95],
        ticksuffix="  ",
        tickfont=dict(family='Nunito', size=14),
    ),
    xaxis=dict(
        zeroline=False,
        showline=False,
        showticklabels=False,
        showgrid=False,
        domain=[0.15, 0.9],
    ),
    legend=dict(x=0.029, y=1.038, font_size=13),
    margin=dict(l=100, r=20, t=70, b=70),
    template=None,
    font_family='Nunito',
)

annotations = []

y_s = np.round(y_saving, decimals=2)

# Adding labels
for yd, xd in zip(y_s, x):
    # labeling the bar net worth
    annotations.append(dict(xref='x1', yref='y1',
                            y=xd, x=yd + 0.5,
                            text=str(yd) + '%',
                            font=dict(family='Nunito', size=14,
                                      color='rgb(138,43,226)'),
                            showarrow=False))

fig.update_layout(annotations=annotations)

fig.show()
fig.write_html("images/top10_occupations_males.html")

In [11]:
ctr = Counter(df_female['occupation'])
relevant_occupations_female = pd.DataFrame.from_dict(ctr, orient='index').reset_index().rename(columns={'index': 'qid', 0: 'count'}).sort_values('count', ascending=False)[0:10]
relevant_occupations_female['meaning'] = relevant_occupations_female['qid'].apply(lambda s: get_entity_dict_from_api(s)['labels']['en']['value'])
relevant_occupations_female['count'] = relevant_occupations_female['count'].divide(len(df_female))
relevant_occupations_female = relevant_occupations_female.sort_values('count', ascending=True)
relevant_occupations_female['meaning'] = relevant_occupations_female['meaning'].map(lambda x: x.capitalize())
relevant_occupations_female.head(10)

Unnamed: 0,qid,count,meaning
62,Q6625963,0.014811,Novelist
14,Q40348,0.015395,Lawyer
11,Q11513337,0.0172,Athletics competitor
7,Q4610556,0.020947,Model
46,Q937857,0.022155,Association football player
1,Q1930187,0.038419,Journalist
52,Q36180,0.040537,Writer
19,Q177220,0.05342,Singer
0,Q82955,0.099807,Politician
10,Q33999,0.13828,Actor


In [14]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import numpy as np

y_saving = (relevant_occupations_female['count']*100).tolist()

x = relevant_occupations_female['meaning'].tolist()


# Creating two subplots
fig = go.Figure()

fig.add_trace(go.Bar(
    x=y_saving,
    y=x,
    marker=dict(
        color='rgba(218,165,32, 0.8)',
        line=dict(
        color='rgba(218,165,32, 1.0)',
        width=1),
    ),
    name='Top 10 male occupations in Quotebank\'s speakers',
    orientation='h',
))

fig.update_layout(
    autosize=False,
    width=1000,
    height=500,
    title='Top 10 occupations for females',
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=True,
        domain=[0, 0.95],
        ticksuffix="  ",
        tickfont=dict(family='Nunito', size=14),
    ),
    xaxis=dict(
        zeroline=False,
        showline=False,
        showticklabels=False,
        showgrid=False,
        domain=[0.15, 0.9],
    ),
    legend=dict(x=0.029, y=1.038, font_size=13),
    margin=dict(l=100, r=20, t=70, b=70),
    template=None,
    font_family='Nunito',
)

annotations = []

y_s = np.round(y_saving, decimals=2)

# Adding labels
for yd, xd in zip(y_s, x):
    # labeling the bar net worth
    annotations.append(dict(xref='x1', yref='y1',
                            y=xd, x=yd + 0.6,
                            text=str(yd) + '%',
                            font=dict(family='Nunito', size=14,
                                      color='rgb(218,165,32)'),
                            showarrow=False))

fig.update_layout(annotations=annotations)

fig.show()
fig.write_html("images/top10_occupations_females.html")

In [13]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import numpy as np

y_saving_f = (relevant_occupations_female['count']*100).tolist()

x_f = relevant_occupations_female['meaning'].tolist()


y_saving = (relevant_occupations_male['count']*100).tolist()

x = relevant_occupations_male['meaning'].tolist()


# Creating two subplots
fig = go.Figure()

fig.add_trace(go.Bar(
    x=y_saving_f,
    y=x_f,
    marker=dict(
        color='rgba(218,165,32, 0.8)',
        line=dict(
        color='rgba(218,165,32, 1.0)',
        width=1),
    ),
    name='female',
    orientation='h',
))

fig.add_trace(go.Bar(
    x=y_saving,
    y=x,
    marker=dict(
        color='rgba(138,43,226, 0.8)',
        line=dict(
        color='rgba(138,43,226, 1.0)',
        width=1),
    ),
    name='male',
    orientation='h',
))

fig.update_layout(
    barmode='stack',
    autosize=False,
    width=1000,
    height=500,
    title='Top 10 occupations for females',
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=True,
        domain=[0, 0.95],
        ticksuffix="  ",
        tickfont=dict(family='Nunito', size=14),
    ),
    xaxis=dict(
        zeroline=False,
        showline=False,
        showticklabels=False,
        showgrid=False,
        domain=[0.15, 0.9],
    ),
    legend=dict(x=0.929, y=1.038, font_size=13),
    margin=dict(l=100, r=20, t=70, b=70),
    template=None,
    font_family='Nunito',
)

# annotations = []

# y_s = np.round(y_saving, decimals=2)

# # Adding labels
# for yd, xd in zip(y_s, x):
#     # labeling the bar net worth
#     annotations.append(dict(xref='x1', yref='y1',
#                             y=xd, x=yd + 0.6,
#                             text=str(yd) + '%',
#                             font=dict(family='Nunito', size=14,
#                                       color='rgb(218,165,32)'),
#                             showarrow=False))

# fig.update_layout(annotations=annotations)

fig.show()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import numpy as np

y_saving = (relevant_occupations_male['count']*100).tolist()

x = relevant_occupations_male['meaning'].tolist()


# Creating two subplots
fig = go.Figure()

fig.add_trace(go.Bar(
    x=y_saving,
    y=x,
    marker=dict(
        color='rgba(138,43,226, 0.8)',
        line=dict(
        color='rgba(138,43,226, 1.0)',
        width=1),
    ),
    name='Top 10 male occupations in Quotebank\'s speakers',
    orientation='h',
))

fig.update_layout(
    autosize=False,
    width=1000,
    height=500,
    title='Top 10 occupations for males',
    yaxis=dict(
        showgrid=False,
        showline=False,
        showticklabels=True,
        domain=[0, 0.95],
        ticksuffix="  ",
        tickfont=dict(family='Nunito', size=14),
    ),
    xaxis=dict(
        zeroline=False,
        showline=False,
        showticklabels=False,
        showgrid=False,
        domain=[0.15, 0.9],
    ),
    legend=dict(x=0.029, y=1.038, font_size=13),
    margin=dict(l=100, r=20, t=70, b=70),
    template=None,
    font_family='Nunito',
)

annotations = []

y_s = np.round(y_saving, decimals=2)

# Adding labels
for yd, xd in zip(y_s, x):
    # labeling the bar net worth
    annotations.append(dict(xref='x1', yref='y1',
                            y=xd, x=yd + 0.5,
                            text=str(yd) + '%',
                            font=dict(family='Nunito', size=14,
                                      color='rgb(138,43,226)'),
                            showarrow=False))

fig.update_layout(annotations=annotations)

fig.show()
fig.write_html("images/top10_occupations_males.html")