In [82]:
import pandas as pd
data = pd.read_pickle("./added_sentiment.pkl")
for col in ["nationality", "gender", "occupation"]:
    first = pd.Series([elem[0] if elem[0] is not None else "F" for elem in data[col]])  # keep 1st element
    data[col] = first.values
    data = data[data[col] != "F"]  # filter NaN (~10 raws)
data.head()

# MODEL REGRESSION

In [59]:
from torch import nn as nn
import torch
import numpy as np

class LinearRegressionModel(torch.nn.Module):

    def __init__(self):
        super(LinearRegressionModel, self).__init__()
        self.linear = torch.nn.Linear(1, 1)  # One in and one out

    def forward(self, x):
        y_pred = self.linear(x)
        return y_pred

model = LinearRegressionModel()
X_train = data.quote_year[:15000].to_numpy()
y_train = data.Sentiment[:15000].to_numpy()
X_train = torch.from_numpy(X_train.astype(np.float32)).view(-1,1).unsqueeze(2).unsqueeze(3)
y_train = torch.from_numpy(y_train.astype(np.float32)).view(-1,1)
learning_rate = 0.0005
l = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr =learning_rate)
num_epochs = 40
for epoch in range(num_epochs):
    #forward feed
    y_pred = model(X_train.requires_grad_())

    #calculate the loss
    loss= l(y_pred, y_train)

    #backward propagation: calculate gradients
    loss.backward()

    #update the weights
    optimizer.step()

    #clear out the gradients from the last step loss.backward()
    optimizer.zero_grad()

    print('epoch {}, loss {}'.format(epoch, loss.item()))

In [89]:
params = []
for i in model.parameters():
    params.append(i.item())


# Interactive plots

In [None]:
# load data
import pandas as pd
data = pd.read_pickle("./added_sentiment.pkl")
for col in ["nationality", "gender", "occupation"]:
    first = pd.Series([elem[0] if elem[0] is not None else "F" for elem in data[col]])  # keep 1st element
    data[col] = first.values
    data = data[data[col] != "F"]  # filter NaN (~10 raws)
data.head()

In [47]:
countries = data.groupby(["nationality"]).mean()
countries_years = [data[data["quote_year"] == year].groupby(["nationality"]).mean()["Sentiment"] for year in range(2015, 2021)]
all_years = pd.concat(countries_years, axis=1).dropna()

nationality
Afghanistan        0.077200
Akkadian empire   -0.544550
Argentina          0.403820
Armenia           -0.972300
Australia          0.104444
Name: Sentiment, dtype: float64

In [49]:
import geopandas
import bokeh
import geopandas as gpd
world = geopandas.read_file(geopandas.datasets.get_path('naturalearth_lowres'))
world = world[world.name.isin(all_years.index)]
for i in range(2015, 2021):
    data_cur = countries_years[i - 2015]
    world[f'sentiment_{i}'] = data_cur[data_cur.index.isin(world.name.values)].values

world["mean"] = world.iloc[:,  6:].mean(axis=1)
world_gpd = gpd.GeoDataFrame(world)

In [163]:
world_gpd.head()

Unnamed: 0,pop_est,continent,name,iso_a3,gdp_md_est,geometry,sentiment_2015,sentiment_2016,sentiment_2017,sentiment_2018,sentiment_2019,sentiment_2020,mean
3,35623680,North America,Canada,CAN,1674000.0,"MULTIPOLYGON (((-122.84000 49.00000, -122.9742...",-0.052133,-0.078586,-0.149888,0.109549,0.153865,0.40382,0.064438
4,326625791,North America,United States of America,USA,18560000.0,"MULTIPOLYGON (((-122.84000 49.00000, -120.0000...",0.196858,0.154295,0.146679,0.1285,0.095186,0.104444,0.13766
8,260580739,Asia,Indonesia,IDN,3028000.0,"MULTIPOLYGON (((141.00021 -2.60015, 141.01706 ...",0.319318,0.060875,0.050657,0.229069,0.24925,-0.09368,0.135915
9,44293293,South America,Argentina,ARG,879400.0,"MULTIPOLYGON (((-68.63401 -52.63637, -68.25000...",0.325583,0.15,0.040283,0.307546,0.253975,0.5574,0.272465
10,17789267,South America,Chile,CHL,436100.0,"MULTIPOLYGON (((-68.63401 -52.63637, -68.63335...",0.004757,0.149151,0.080823,0.03928,0.013578,-0.375425,-0.014639


In [171]:

import pandas_bokeh
from bokeh.io import show
from bokeh.models import Panel, Tabs
pandas_bokeh.output_notebook()


slider_cols = [f'sentiment_{i}' for i in range(2015, 2021)]

html1 = world_gpd.plot_bokeh(figsize=(700, 400),
    simplify_shapes=5000,
    show_colorbar=True,
    colormap="Colorblind",
    slider=slider_cols,
    slider_range=range(2015, 2021),
    slider_name="Year",
    hovertool_columns=["name"] + slider_cols,
    title="Sentiment regarding LGBT community",
    legend="Per year")

html2 = world_gpd.plot_bokeh(figsize=(700, 400),
    simplify_shapes=5000,
    figure=html1,
    colormap="Colorblind",
    category="mean",
    legend="Mean",
    show_colorbar=False,
    return_html=True)

#Write some HTML and embed the HTML plot below it. For production use, please use
#Templates and the awesome Jinja library.


Iteration over multi-part geometries is deprecated and will be removed in Shapely 2.0. Use the `geoms` property to access the constituent parts of a multi-part geometry.




Iteration over multi-part geometries is deprecated and will be removed in Shapely 2.0. Use the `geoms` property to access the constituent parts of a multi-part geometry.



In [172]:
html = html2

#Export the HTML string to an external HTML file and show it:
with open("sentiment.html" , "w") as f:
    f.write(html)

import webbrowser
webbrowser.open("sentiment.html")

True

In [None]:
from bokeh.io import show
from bokeh.models import Panel, Tabs
from bokeh.plotting import figure

tabs = []
for elem in ["United States of America", "United Kingdom", "Russia", "Canada", "India", "Philippines", "South Africa"]:
    p1 = figure(width=300, height=300)
    points = data[data.nationality == elem].groupby("quote_year")["Sentiment"].agg(["min", "max", "mean"])
    p1.circle(points.index, points["max"].values, color="red")
    p1.line(points.index, points["mean"].values, line_width=3, color="navy", alpha=0.5)
    p1.circle(points.index, points["min"].values, color="navy")
    tabs.append(Panel(child=p1, title=elem))

show(Tabs(tabs=tabs))

In [None]:
# !pip3 install git+https://github.com/PatrikHlobil/Pandas-Bokeh
from bokeh.io import show
from bokeh.models import Panel, Tabs
from bokeh.plotting import figure

for elem in ["United States of America", "United Kingdom", "Russia", "Canada", "India", "Philippines", "South Africa"]:
    p1 = figure(width=300, height=300)
    points = data[data.nationality == elem].groupby("quote_year")["Sentiment"].agg(["min", "max", "mean"])
    p1.circle(points.index, points["max"].values, color="red")
    p1.line(points.index, points["mean"].values, line_width=3, color="navy", alpha=0.5)
    p1.circle(points.index, points["min"].values, color="navy")
    tabs.append(Panel(child=p1, title=elem))

show(Tabs(tabs=tabs))

In [7]:
from bokeh.io import show
from bokeh.models import Panel, Tabs
from bokeh.plotting import figure
from bokeh.plotting import figure, output_file, show
tabs = []
data = {"Loss": losses, "Accuracy": accs}
for elem in ["Loss", "Accuracy"]:
    p1 = figure(width=500, height=300)
    # p1.circle(points.index, points["max"].values, color="red")
    p1.line(range(3), data[elem][0], line_width=3, color="navy", alpha=0.5, legend_label="Train")
    # p1.circle(points.index, points["min"].values, color="navy")
    p1.line(range(3), data[elem][1], line_width=3, color="orange", alpha=0.5, legend_label="Validation")
    tabs.append(Panel(child=p1, title=elem))
    p1.xaxis.axis_label = "Epoch"
    p1.legend.location = "top_left"
    p1.add_layout(p1.legend[0], 'right')
output_file("balanced.html", title="Balanced data")
show(Tabs(tabs=tabs))

In [40]:
import numpy as np
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import string

#Define data for heatmap
N=4
x = ["USA", "UK", "Australia", "Canada"]
y = ["USA", "UK", "Australia", "Canada"]
z2 = unbalanced_data
z1 = balanced_data

fig1 = ff.create_annotated_heatmap(z1, x, y,  colorscale='blues', name='Unbalanced')
fig2 = ff.create_annotated_heatmap(z2, x, y, colorscale='blues', name='Balanced')
fig1.layout.update({'title': 'Unbalanced classes'})
fig2.layout.update({'title': 'Balanced classes'})
fig = make_subplots(
    rows=1, cols=2,
    horizontal_spacing=0.13,
)

fig.add_trace(fig1.data[0], 1, 1)
fig.add_trace(fig2.data[0], 1, 2)

annot1 = list(fig1.layout.annotations)
annot2 = list(fig2.layout.annotations)
for k  in range(len(annot2)):
    annot2[k]['xref'] = 'x2'
    annot2[k]['yref'] = 'y2'
fig.layout.xaxis1.update({'title': 'Unbalanced classes'})
fig.layout.xaxis2.update({'title': 'Balanced classes'})
fig.update_layout(annotations=annot1+annot2)
fig.layout.update({'title': 'Confusion matrix'})
fig.write_html("conf.html")
fig.show()

In [153]:
data["round"] = data.Sentiment.round(2)
median_data = data.groupby(["occupation", "quote_year"]).median()["round"]
max_data = data.groupby(["occupation", "quote_year"]).max()["Sentiment"]
min_data = data.groupby(["occupation", "quote_year"]).min()["Sentiment"]
std_data = data.groupby(["occupation", "quote_year"]).std()["Sentiment"]
mean_data = data.groupby(["occupation", "quote_year"]).mean()["Sentiment"]
counts = data.groupby("occupation").quote_year.value_counts()

In [159]:
from bokeh.palettes import Spectral8
from bokeh.models import Title
from bokeh.plotting import figure, output_file, show

p = figure(width=800, height=250, x_axis_type="linear", x_axis_label="Year", y_axis_label='Sentiment')
p.add_layout(Title(text='Occupation sentiment', text_font_size="16pt"), 'above')

for idx, occ in enumerate(["politician", "actor", "journalist", "writer", "lawyer", "LGBTIQ+ rights activist", "singer", "film director"]):
    ar = p.varea(x=median_data[occ].index, y1=mean_data[occ].values - 1.96 * std_data[occ].values / np.sqrt(counts[occ].values),
                 y2=mean_data[occ].values + 1.96 * std_data[occ].values / np.sqrt(counts[occ].values), alpha=0.3, color=Spectral8[idx] if idx!= 5 else "violet", legend_label=country)
    line = p.line(mean_data[occ].index, mean_data[occ].values, line_width=2, color=Spectral8[idx] if idx!= 5 else "violet", alpha=0.8, legend_label=occ)
    scatter = p.scatter(median_data[occ].index, median_data[occ].values, color=Spectral8[idx] if idx!= 5 else "violet", alpha=1, legend_label=occ)
    if idx > 0:
        line.visible = False
        ar.visible = False
        scatter.visible = False

p.legend.location = "top_left"
p.add_layout(p.legend[0], 'right')
p.legend.click_policy="hide"

output_file("median_mean_countries.html")

show(p)


In [257]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from bokeh.palettes import Spectral10

nats = data.nationality.value_counts()[:10]
occs = data.occupation.value_counts()[:10]
# data_ = data.copy()
# data_["nationality"][~data_.nationality.isin(nats.index)] = "Other"
# data_["occupation"][~data_.occupation.isin(occs.index)] = "Other"
# nats = data_.nationality.value_counts()[:10]
# occs = data_.occupation.value_counts()[:10]
fig = make_subplots(rows=1, cols=2, subplot_titles=("Nationality", "Occupation"))
# fig.add_trace(go.Pie(labels=nats.index[:10], values=nats.values[:10], name="Nationality"),
#               1, 1)
# fig.add_trace(go.Pie(labels=occs.index[:10], values=occs.values[:10], name="Occupation"),
#               1, 2)
fig.add_trace(go.Bar(x=nats.index, y=nats.values, marker={'color': Spectral10[::-1]}),
              1, 1)
fig.add_trace(go.Bar(x=occs.index, y=occs.values, marker={'color': Spectral10[::-1]}),
              1, 2)
fig.update_yaxes(title_text='Quotes')
# Use `hole` to create a donut-like pie chart
# fig.update_traces(hole=.4, hoverinfo="label+percent+name")

fig.update_layout(template="plotly_white",
    title_text="""Who speaks about LGBT issues more?""", showlegend=False)
    # Add annotations in the center of the donut pies.)
fig.write_html("occ_nats_dictribution.html")
fig.show()

In [275]:
data.drop(data[data.date_of_birth == ''].index, inplace=True)
data["age"] = data.quote_year.astype(int) - data.date_of_birth.astype(int)
data["round_"] = data.Sentiment.round(1)
younger = data[data.age < 30]["round_"].value_counts()
older = data[data.age > 60]["round_"].value_counts()

In [276]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from bokeh.palettes import Spectral10

fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=younger.index, values=younger.values, name="<30>"),
              1, 1)
fig.add_trace(go.Pie(labels=older.index, values=older.values, name=">60"),
              1, 2)
# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.4, hoverinfo="label+percent+name")

fig.update_layout(template="plotly_white",
    title_text="Sentiment depending on age", showlegend=True)
    # Add annotations in the center of the donut pies.)
# fig.write_html("occ_nats_dictribution.html")
fig.show()

In [296]:
data.gender[~data.gender.isin(["male", "female"])] = "other"
new_data = pd.DataFrame({gender: data[data.gender == gender]["round_"] for gender in ["male", "female", "other"]})
p_stacked_bar = new_data.plot_bokeh.hist(
    ylabel="How many people have this sentiment",
    title="Sentiment distribution by genders",
    hovertool=False,
    alpha=0.6)
new_data = pd.DataFrame({"boomers": data[(data.age >= 40) & (data.age <= 60)]["round_"], "older": data[data.age > 60]["round_"], "younger": data[data.age < 40]["round_"]})
p_bar = new_data.plot_bokeh.hist(
    ylabel="How many people have this sentiment",
    title="Sentiment distribution by age",
    hovertool=False,
    alpha=0.6)
html = pandas_bokeh.plot_grid([[p_stacked_bar, p_bar]], plot_width=400, plot_height=260, return_html=True)
with open("gender_age.html" , "w") as f:
    f.write(html)


There are NaN values in column 'male'. For the histogram, these rows have been neglected.


There are NaN values in column 'female'. For the histogram, these rows have been neglected.


There are NaN values in column 'other'. For the histogram, these rows have been neglected.




There are NaN values in column 'boomers'. For the histogram, these rows have been neglected.


There are NaN values in column 'older'. For the histogram, these rows have been neglected.


There are NaN values in column 'younger'. For the histogram, these rows have been neglected.




plot_width and plot_height was deprecated in Bokeh 2.4.0 and will be removed, use width or height instead.



## Average portrait of social groups

In [225]:
data[data["round"] >= 0.8][["nationality", "occupation", "age"]].value_counts()[:30]

nationality               occupation               age
United States of America  politician               37     531
                                                   57     439
                                                   59     438
                                                   62     406
                                                   69     405
                                                   68     404
                                                   45     399
                                                   60     396
                                                   58     376
                          LGBTIQ+ rights activist  46     364
                          politician               63     362
                                                   53     345
                                                   61     339
                                                   54     339
                                                   50     326
               

In [298]:
data.occupation.value_counts()

politician              24408
actor                   11572
journalist               6400
writer                   4718
lawyer                   3579
                        ...  
sledge hockey player        1
agronomist                  1
kickboxer                   1
discus thrower              1
country singer              1
Name: occupation, Length: 659, dtype: int64

In [302]:
from scipy import stats
data_dict = {occ: data[data.occupation == occ]["round"] for occ in ["politician", "actor", "journalist", "lawyer", "writer"]}
stats.mstats.kruskalwallis(*[data_dict[col].values for col in data_dict.keys()])

KruskalResult(statistic=579.0185026698815, pvalue=5.381330338139509e-124)