# Imports

In [None]:
import numpy as np
import pandas as pd

from sklearn.manifold import TSNE
from sklearn.preprocessing import OrdinalEncoder

from matplotlib import pyplot as plt
import matplotlib.colors as colors
from matplotlib.lines import Line2D

import plotly.express as px


In [None]:
# Candidate perplexities scanned when SEARCH_BEST_PERPLEXITY is enabled.
perplexity_list = list(range(10, 51, 10))
# When True, the per-region Plotly scatter plots are rendered.
PLOT_INTERACTIVE = True
# When True, the t-SNE cells search `perplexity_list` instead of using a fixed perplexity.
SEARCH_BEST_PERPLEXITY: bool = False
# Number of rows the ged_sb target is shifted ahead within each country group.
PREDICTION_WINDOW = 14

# Functions

In [None]:
def interactive_plots(res, region, ged_sb_14, gw_statename, date, title_prefix):
    """Show one interactive Plotly scatter of the embedding per region.

    Args:
        res: 2-D array; column 0 is plotted on x and column 1 on y.
        region: region label per row; one figure is produced per unique value.
        ged_sb_14: numeric value per row, used for marker colour and (offset
            by 10) for marker size.
        gw_statename: value per row shown on hover.
        date: value per row shown on hover.
        title_prefix: text placed in front of ``in <region>`` in each title.
            Newline characters are rendered as line breaks.
    """
    # Create a DataFrame to work easily with Plotly
    data = pd.DataFrame({
        'x': res[:, 0],
        'y': res[:, 1],
        'region': region,
        'ged_sb_14': ged_sb_14,
        'gw_statename': gw_statename,
        'date': date
    })

    # Shared axis limits so the per-region figures are directly comparable
    x_min, x_max = data['x'].min(), data['x'].max()
    y_min, y_max = data['y'].min(), data['y'].max()

    # Generate interactive plots for each unique region
    for reg in data['region'].unique():
        filtered_data = data[data['region'] == reg]

        # Plotly renders "<br>" as a line break; a plain "\n" is collapsed to
        # whitespace, so the multi-line title is converted here.
        title = f"{title_prefix}\nin {reg}".replace("\n", "<br>")

        fig = px.scatter(
            filtered_data,
            x='x',
            y='y',
            color='ged_sb_14',
            hover_data=['gw_statename', 'date'],  # Show 'gw_statename' and 'date' on hover
            title=title,
            labels={'ged_sb_14': 'GED SB 14'},
            color_continuous_scale='inferno',
            size=filtered_data['ged_sb_14'] + 10,  # Offset keeps zero-valued rows visible
        )

        # Update marker opacity
        fig.update_traces(marker=dict(opacity=0.7))

        # Set the x and y axis limits
        fig.update_layout(
            xaxis_range=[x_min, x_max],
            yaxis_range=[y_min, y_max],
        )

        fig.show()

In [None]:
def noninteractive_plots(res, region, ged_sb_14, titile_prefix):
    """Draw one matplotlib scatter of the embedding per region.

    Marker colour and size both encode ``ged_sb_14``; rows with a value of 0
    are drawn almost transparent.

    Args:
        res: 2-D array; column 0 is plotted on x and column 1 on y.
        region: pandas Series of region labels, one per row of ``res``.
        ged_sb_14: pandas Series of numeric values, one per row of ``res``.
        titile_prefix: text placed in front of ``in <region>`` in each title
            (the parameter name is kept as-is for existing callers).
    """
    cmap = plt.cm.Reds
    # Same limits for every region so the figures are comparable
    x_limits = (res[:, 0].min(), res[:, 0].max())
    y_limits = (res[:, 1].min(), res[:, 1].max())

    for reg in region.unique():
        mask = region == reg
        if not mask.any():
            # e.g. a NaN label: nothing to plot, and min()/max() would be NaN
            continue

        values = ged_sb_14[mask]
        lowest, highest = values.min(), values.max()
        # Avoid 0/0 -> NaN marker sizes when the region has no deaths at all
        size_scale = highest if highest > 0 else 1

        plt.xlim(*x_limits)
        plt.ylim(*y_limits)

        # Define normalization for the colormap
        norm = colors.Normalize(vmin=lowest, vmax=highest)

        plt.scatter(
            res[:, 0][mask],
            res[:, 1][mask],
            c=values,
            alpha=(values > 0) * 0.7 + 0.10,
            s=values / size_scale * 50 + 5,
            cmap=cmap,
            norm=norm,
            edgecolors='black',  # Adds a thin black border
            linewidths=0.5  # Sets the border thickness
        )

        # Sample values for the legend; a single entry when all values are equal
        sample_values = np.linspace(lowest, highest, num=4 if highest > lowest else 1)
        sample_sizes = sample_values / size_scale * 50 + 5

        # Empty scatters serve as legend handles with matching size and colour
        legend_elements = []
        for val, size in zip(sample_values, sample_sizes):
            handle = plt.scatter([], [], s=size, c=[cmap(norm(val))],
                                 edgecolors='black', linewidths=0.5)
            legend_elements.append((handle, int(val)))

        plt.legend([h for h, _ in legend_elements], [l for _, l in legend_elements],
                   title='Size & Color\n(battle-related\nged_sb deaths)',
                   # loc="center left",
                   # bbox_to_anchor=(1.3, 0.5)
                   )

        plt.title(f"{titile_prefix} in {reg}")
        plt.show()

In [None]:
def region_plot(res, region, title):
    """Scatter the embedding coloured by region, with a matching legend.

    Args:
        res: 2-D array; column 0 is plotted on x and column 1 on y.
        region: region label per row of ``res``.
        title: plot title.
    """
    ordinal_encoder = OrdinalEncoder()
    labels = ordinal_encoder.fit_transform(np.array(region)[..., np.newaxis]).ravel()

    scatter = plt.scatter(res[::, 0], res[::, 1], c=labels, cmap="tab20", s=5)
    plt.title(title)

    # Region names in the encoder's own order: position i is the name that was
    # encoded as the value i in `labels`.
    regions = ordinal_encoder.categories_[0]

    # Ask the scatter itself which colour it uses for each code. The scatter
    # normalises the codes over the colormap, so indexing tab20 directly with
    # 0..n-1 (and in order of first appearance) would not match the points.
    legend_colors = scatter.to_rgba(np.arange(len(regions)))

    # Create legend entries
    legend_elements = [
        Line2D(
            [0],
            [0],
            marker="o",
            color="w",
            label=reg,
            markerfacecolor=legend_colors[i],
            markersize=10,
        )
        for i, reg in enumerate(regions)
    ]

    # Add the legend to the plot, outside the axes on the right
    plt.legend(
        handles=legend_elements, title="Regions" if len(regions) < 10 else "Regions23", loc="center left",
        bbox_to_anchor=(1, 0.5)
    )

    plt.show()

In [None]:
def search_best_perplexity(data, perplexity_list, n):
    """Fit one t-SNE per candidate perplexity and return the one with the lowest KL divergence.

    A line plot of KL divergence against perplexity is shown, and the best
    pair is printed.

    Args:
        data: feature matrix passed to ``TSNE.fit``.
        perplexity_list: candidate perplexity values.
        n: number of embedding dimensions (``n_components``).

    Returns:
        The candidate perplexity with the smallest ``kl_divergence_`` (a float,
        because candidates and divergences are stored in one float array).

    Raises:
        ValueError: if ``perplexity_list`` is empty.
    """
    if len(perplexity_list) == 0:
        raise ValueError("perplexity_list must contain at least one value")

    # One row per candidate: [perplexity, KL divergence of the fitted model]
    pt = np.array(
        [
            [perplexity, TSNE(n_components=n, perplexity=perplexity, n_jobs=-1).fit(data).kl_divergence_]
            for perplexity in perplexity_list
        ]
    )
    plt.plot(pt[::, 0], pt[::, 1])
    plt.show()

    # argmin yields the ROW of the best candidate; read both values from that row
    best_row = pt[::, 1].argmin()
    min_perplexity = pt[best_row, 0]
    min_kl_divergence = pt[best_row, 1]

    print(f"Perplexity: {min_perplexity}, KL Divergence: {min_kl_divergence}")

    return min_perplexity

# Regular Data

In [None]:
# Data: load the feature table and drop every news-topic column
data = pd.read_csv("../data/cm_features_v3.6.csv")
data = data.loc[:, ~data.columns.str.startswith("stock_topic")]

# Target: ged_sb moved PREDICTION_WINDOW rows ahead within each ccode group
data["ged_sb_14"] = data.groupby("ccode")["ged_sb"].shift(periods=-PREDICTION_WINDOW)

# Target divided by wdi_sp_pop_totl and scaled by one million
data["armed_conflict_14"] = data["ged_sb_14"].div(data["wdi_sp_pop_totl"]).mul(1_000_000)

# Rows whose shifted target is missing are removed
data = data[data["ged_sb_14"].notna()]

# Targets and identifier columns are taken out so `data` holds features only
ac, gw_statename, gw, ccode, region = (
    data.pop(column)
    for column in ("armed_conflict_14", "gw_statename", "gleditsch_ward", "ccode", "region")
)
region23, ged_sb_14, country_id, date, month_id = (
    data.pop(column)
    for column in ("region23", "ged_sb_14", "country_id", "date", "month_id")
)

In [None]:
# T-SNE
# Perplexity: searched over perplexity_list when enabled, otherwise fixed at 20
min_perplexity = (
    search_best_perplexity(data, perplexity_list=perplexity_list, n=2)
    if SEARCH_BEST_PERPLEXITY
    else 20
)

tsne = TSNE(n_jobs=-1, perplexity=min_perplexity)

# Embedding of the feature frame; used by the plotting cell that follows
res = tsne.fit_transform(data)

In [None]:
# s = ged_sb_14 > 10000
# plt.scatter(res[::, 0], res[::, 1], c=s, s=s * 10, alpha=1)
# plt.title("ged_sb_14 > 10000")
# plt.show()

In [None]:
title_prefix = (
    "t-SNE based on all cm_features excluding\n"
    "country_id, month_id and dependent variable"
)

# Embedding coloured by the coarse region label
region_plot(res, region, title_prefix)
print("-" * 96, "-" * 96, sep="\n")
# Embedding coloured by the region23 label
region_plot(res, region23, title_prefix)
print("-" * 96, "-" * 96, sep="\n")
if PLOT_INTERACTIVE:
    interactive_plots(res, region, ged_sb_14, gw_statename, date, title_prefix)
    print("-" * 96, "-" * 96, sep="\n")
# noninteractive_plots(res, region23, ged_sb_14, title_prefix)

# without GED/ACLED

In [None]:
# Data: load the feature table and drop every news-topic column
data = pd.read_csv("../data/cm_features_v3.6.csv")
data = data.loc[:, ~data.columns.str.startswith("stock_topic")]

# Target: ged_sb moved PREDICTION_WINDOW rows ahead within each ccode group
data["ged_sb_14"] = data.groupby("ccode")["ged_sb"].shift(periods=-PREDICTION_WINDOW)

# Target divided by wdi_sp_pop_totl and scaled by one million
data["armed_conflict_14"] = data["ged_sb_14"].div(data["wdi_sp_pop_totl"]).mul(1_000_000)

# Rows whose shifted target is missing are removed
data = data[data["ged_sb_14"].notna()]

# Targets and identifier columns are taken out so `data` holds features only
ac, gw_statename, gw, ccode, region = (
    data.pop(column)
    for column in ("armed_conflict_14", "gw_statename", "gleditsch_ward", "ccode", "region")
)
region23, ged_sb_14, country_id, date, month_id = (
    data.pop(column)
    for column in ("region23", "ged_sb_14", "country_id", "date", "month_id")
)

print("Original shape:", data.shape)
# Drop every remaining column whose name contains "ged" or "acled"
data = data.loc[:, ~(data.columns.str.contains("ged") | data.columns.str.contains("acled"))]
print("Shape without GED/ACLED:", data.shape)

In [None]:
# T-SNE
# Perplexity: searched over perplexity_list when enabled, otherwise fixed at 20
min_perplexity = (
    search_best_perplexity(data, perplexity_list=perplexity_list, n=2)
    if SEARCH_BEST_PERPLEXITY
    else 20
)

tsne = TSNE(n_jobs=-1, perplexity=min_perplexity)

# Embedding of the frame with the ged/acled columns removed
res = tsne.fit_transform(data)

In [None]:
title_prefix = (
    "t-SNE based on all cm_features excluding\n"
    "country_id, month_id and fatalities\n"
    "features (ged_sb and acled)"
)

# Embedding coloured by the coarse region label, then by region23
region_plot(res=res, region=region, title=title_prefix)
print("-" * 96, "-" * 96, sep="\n")
region_plot(res=res, region=region23, title=title_prefix)
print("-" * 96, "-" * 96, sep="\n")
if PLOT_INTERACTIVE:
    interactive_plots(res, region, ged_sb_14, gw_statename, date, title_prefix)
    print("-" * 96, "-" * 96, sep="\n")
# Static per-region23 figures sized and coloured by ged_sb_14
noninteractive_plots(res, region23, ged_sb_14, title_prefix)

# 3D

In [None]:
# Data: load the feature table and drop every news-topic column
data = pd.read_csv("../data/cm_features_v3.6.csv")
data = data.loc[:, ~data.columns.str.startswith("stock_topic")]

# Target: ged_sb moved PREDICTION_WINDOW rows ahead within each ccode group
data["ged_sb_14"] = data.groupby("ccode")["ged_sb"].shift(periods=-PREDICTION_WINDOW)

# Target divided by wdi_sp_pop_totl and scaled by one million
data["armed_conflict_14"] = data["ged_sb_14"].div(data["wdi_sp_pop_totl"]).mul(1_000_000)

# Rows whose shifted target is missing are removed
data = data[data["ged_sb_14"].notna()]

# Targets and identifier columns are taken out so `data` holds features only
ac, gw_statename, gw, ccode, region = (
    data.pop(column)
    for column in ("armed_conflict_14", "gw_statename", "gleditsch_ward", "ccode", "region")
)
region23, ged_sb_14, country_id, date, month_id = (
    data.pop(column)
    for column in ("region23", "ged_sb_14", "country_id", "date", "month_id")
)

In [None]:
# T-SNE
# Perplexity: searched over perplexity_list (3 components) when enabled, otherwise 20
min_perplexity = (
    search_best_perplexity(data, perplexity_list=perplexity_list, n=3)
    if SEARCH_BEST_PERPLEXITY
    else 20
)

tsne = TSNE(perplexity=min_perplexity, n_components=3, n_jobs=-1)

# Three-column embedding consumed by the 3D scatter cell
res = tsne.fit_transform(data)

In [None]:
title_prefix = "t-SNE based on all cm_features excluding\ncountry_id, month_id and dependent variable"

# One boolean row mask per region label
masks = [(reg, region == reg) for reg in region.unique()]

# One interactive 3D scatter per region: colour encodes ged_sb_14, and rows with
# a positive value get a larger marker (size 4 instead of 1).
for reg, mask in masks:
    fig = px.scatter_3d(
        res[mask],
        x=res[::, 0][mask],
        y=res[::, 1][mask],
        z=res[::, 2][mask],
        size=(ged_sb_14[mask] > 0) * 3 + 1,
        color=ged_sb_14[mask],
        color_continuous_scale="Inferno",
        opacity=0.7,
        # Plotly renders "<br>" as a line break; a plain "\n" is collapsed to
        # whitespace, so the multi-line title is converted here.
        title=f"{title_prefix} in {reg}".replace("\n", "<br>"),
    )
    fig.show()

# NEWS

In [None]:
# Data
data = pd.read_csv("../data/cm_features_v3.6.csv")

# Keep only rows where at least one stock_topic column is non-zero
stock_topic = data.loc[:, data.columns.str.startswith("stock_topic")]
mask = (stock_topic != 0).sum(axis=1) > 0
data = data.loc[mask]

# Target: ged_sb moved PREDICTION_WINDOW rows ahead within each ccode group
data["ged_sb_14"] = data.groupby("ccode")["ged_sb"].shift(periods=-PREDICTION_WINDOW)

# Target divided by wdi_sp_pop_totl and scaled by one million
data["armed_conflict_14"] = data["ged_sb_14"].div(data["wdi_sp_pop_totl"]).mul(1_000_000)

# Rows whose shifted target is missing are removed
data = data[data["ged_sb_14"].notna()]

# Targets and identifier columns are taken out so `data` holds features only
ac, gw_statename, gw, ccode, region = (
    data.pop(column)
    for column in ("armed_conflict_14", "gw_statename", "gleditsch_ward", "ccode", "region")
)
region23, ged_sb_14, country_id, date, month_id = (
    data.pop(column)
    for column in ("region23", "ged_sb_14", "country_id", "date", "month_id")
)

In [None]:
# T-SNE
# Perplexity: searched over perplexity_list when enabled, otherwise fixed at 10
min_perplexity = (
    search_best_perplexity(data, perplexity_list=perplexity_list, n=2)
    if SEARCH_BEST_PERPLEXITY
    else 10
)

tsne = TSNE(perplexity=min_perplexity, n_components=2, n_jobs=-1)

# Two-column embedding of features plus news columns
res = tsne.fit_transform(data)

In [None]:
title_prefix = (
    "t-SNE based on all cm_features and news excluding\n"
    "country_id, month_id and dependent variable"
)
# Static per-region23 figures sized and coloured by ged_sb_14
noninteractive_plots(
    res,
    region23,
    ged_sb_14,
    title_prefix,
)

# Only News

In [None]:
# Data
data = pd.read_csv("../data/cm_features_v3.6.csv")

# Keep only rows where at least one stock_topic column is non-zero
stock_topic = data.loc[:, data.columns.str.startswith("stock_topic")]
mask = (stock_topic != 0).sum(axis=1) > 0
data = data.loc[mask]

# Target: ged_sb moved PREDICTION_WINDOW rows ahead within each ccode group
data["ged_sb_14"] = data.groupby("ccode")["ged_sb"].shift(periods=-PREDICTION_WINDOW)

# Target divided by wdi_sp_pop_totl and scaled by one million
data["armed_conflict_14"] = data["ged_sb_14"].div(data["wdi_sp_pop_totl"]).mul(1_000_000)

# Rows whose shifted target is missing are removed
data = data[data["ged_sb_14"].notna()]

# Targets and identifier columns are taken out of the frame
ac, gw_statename, gw, ccode, region = (
    data.pop(column)
    for column in ("armed_conflict_14", "gw_statename", "gleditsch_ward", "ccode", "region")
)
region23, ged_sb_14, country_id, date, month_id = (
    data.pop(column)
    for column in ("region23", "ged_sb_14", "country_id", "date", "month_id")
)

# Only the news-topic columns remain as features
data = data.loc[:, data.columns.str.startswith("stock_topic")]

In [None]:
# T-SNE
# Perplexity: searched over perplexity_list when enabled, otherwise fixed at 40
min_perplexity = (
    search_best_perplexity(data, perplexity_list=perplexity_list, n=2)
    if SEARCH_BEST_PERPLEXITY
    else 40
)

tsne = TSNE(perplexity=min_perplexity, n_components=2, n_jobs=-1)

# Two-column embedding of the news-topic columns only
res = tsne.fit_transform(data)

In [None]:
# Static per-region23 figures for the news-only embedding
title_prefix = "t-SNE based on all news"
noninteractive_plots(
    res,
    region23,
    ged_sb_14,
    title_prefix,
)

# 6 Figure Plots

In [None]:
# Data
data = pd.read_csv("../data/cm_features_v3.6.csv")

# Without News
# .copy() makes cm_features an independent frame: `data` stays alive in this
# cell, so assigning new columns to a plain slice of it would trigger pandas'
# SettingWithCopyWarning.
cm_features = data.loc[::, ~data.columns.str.startswith("stock_topic")].copy()

# Target: ged_sb moved PREDICTION_WINDOW rows ahead within each ccode group
cm_features["ged_sb_14"] = cm_features.groupby("ccode")["ged_sb"].shift(-PREDICTION_WINDOW)

# Target divided by wdi_sp_pop_totl and scaled by one million
cm_features["armed_conflict_14"] = cm_features["ged_sb_14"] / cm_features["wdi_sp_pop_totl"] * 1_000_000

# Rows whose shifted target is missing are removed
cm_features = cm_features.loc[cm_features["ged_sb_14"].notna()]

# Targets and identifier columns are taken out so cm_features holds features only
ac_without_news = cm_features.pop("armed_conflict_14")
gw_statename_without_news = cm_features.pop("gw_statename")
gw_without_news = cm_features.pop("gleditsch_ward")
ccode_without_news = cm_features.pop("ccode")
region_without_news = cm_features.pop("region")
region23_without_news = cm_features.pop("region23")
ged_sb_14_without_news = cm_features.pop("ged_sb_14")
country_id_without_news = cm_features.pop("country_id")
date_without_news = cm_features.pop("date")
month_id_without_news = cm_features.pop("month_id")

# -- same features with every column containing "ged", then "acled", removed
cm_features_without_ged = cm_features.loc[::, ~cm_features.columns.str.contains("ged")]
cm_features_without_ged_acled = cm_features_without_ged.loc[::, ~cm_features_without_ged.columns.str.contains("acled")]

# With News: keep only rows where at least one stock_topic column is non-zero
stock_topic = data.loc[::, data.columns.str.startswith("stock_topic")]
mask = (stock_topic != 0).sum(axis=1) > 0
# .copy() for the same reason as above
data_with_news = data.loc[mask, ::].copy()

data_with_news["ged_sb_14"] = data_with_news.groupby("ccode")["ged_sb"].shift(-PREDICTION_WINDOW)

data_with_news["armed_conflict_14"] = data_with_news["ged_sb_14"] / data_with_news["wdi_sp_pop_totl"] * 1_000_000

data_with_news = data_with_news.loc[data_with_news["ged_sb_14"].notna()]

ac_with_news = data_with_news.pop("armed_conflict_14")
gw_statename_with_news = data_with_news.pop("gw_statename")
gw_with_news = data_with_news.pop("gleditsch_ward")
ccode_with_news = data_with_news.pop("ccode")
region_with_news = data_with_news.pop("region")
region23_with_news = data_with_news.pop("region23")
ged_sb_14_with_news = data_with_news.pop("ged_sb_14")
country_id_with_news = data_with_news.pop("country_id")
date_with_news = data_with_news.pop("date")
month_id_with_news = data_with_news.pop("month_id")

# -- features + news with the ged / acled columns removed
data_with_news_without_ged = data_with_news.loc[::, ~data_with_news.columns.str.contains("ged")]
data_with_news_without_ged_acled = data_with_news_without_ged.loc[::, ~data_with_news_without_ged.columns.str.contains("acled")]

# -- news-topic columns only
news = data_with_news.loc[::, data_with_news.columns.str.startswith("stock_topic")]

# -- non-news features restricted to the rows that have news
cm_features2010 = data_with_news.loc[::, ~data_with_news.columns.str.startswith("stock_topic")]

# -- the same, with the ged / acled columns removed
cm_features2010_without_ged = cm_features2010.loc[::, ~cm_features2010.columns.str.contains("ged")]
cm_features2010_without_ged_acled = cm_features2010_without_ged.loc[::, ~cm_features2010_without_ged.columns.str.contains("acled")]

In [None]:
# Seven embeddings are fitted in this cell; each message is printed just before
# the corresponding fit starts, so the counter runs 1..7 out of 7.
print("1/7")
tsne_cm_features = TSNE(perplexity=20, n_jobs=-1)
res_cm_features = tsne_cm_features.fit_transform(cm_features)

print("2/7")
tsne_cm_features_without_ged_acled = TSNE(perplexity=20, n_jobs=-1)
res_cm_features_without_ged_acled = tsne_cm_features_without_ged_acled.fit_transform(cm_features_without_ged_acled)

print("3/7")
tsne_news = TSNE(perplexity=40, n_jobs=-1)
res_news = tsne_news.fit_transform(news)

print("4/7")
tsne_data_with_news = TSNE(perplexity=10, n_jobs=-1)
res_data_with_news = tsne_data_with_news.fit_transform(data_with_news)

print("5/7")
tsne_cm_features2010 = TSNE(perplexity=10, n_jobs=-1)
res_cm_features2010 = tsne_cm_features2010.fit_transform(cm_features2010)

print("6/7")
tsne_cm_features2010_without_ged_acled = TSNE(perplexity=20, n_jobs=-1)
res_cm_features2010_without_ged_acled = tsne_cm_features2010_without_ged_acled.fit_transform(cm_features2010_without_ged_acled)

print("7/7")
tsne_data_with_news_without_ged_acled = TSNE(perplexity=20, n_jobs=-1)
res_data_with_news_without_ged_acled = tsne_data_with_news_without_ged_acled.fit_transform(data_with_news_without_ged_acled)

In [None]:
import os

# Output folder for the saved figures; exist_ok makes the call a no-op when the
# folder is already there and avoids the check-then-create race.
os.makedirs("../figures/tsne-findings", exist_ok=True)

def noninteractive_plots_2x3(
    reses_without_news,
    reses_with_news,
    region_without_news,
    region_with_news,
    ged_sb_14_without_news,
    ged_sb_14_with_news,
    title_prefixes_without_news,
    title_prefixes_with_news,
):
    """Save and show, per region, one figure with a panel for every embedding.

    The "without news" embeddings are drawn first, then the "with news" ones.
    Each panel shows only the rows of the current region, sized and coloured
    by the matching ``ged_sb_14`` values. Figures are written to
    ``../figures/tsne-findings/<region> clusters.png``.

    Args:
        reses_without_news: 2-D embeddings whose rows align with
            ``region_without_news`` / ``ged_sb_14_without_news``.
        reses_with_news: 2-D embeddings whose rows align with
            ``region_with_news`` / ``ged_sb_14_with_news``.
        region_without_news: pandas Series of region labels; its unique values
            determine which figures are produced.
        region_with_news: pandas Series of region labels.
        ged_sb_14_without_news: pandas Series of numeric values.
        ged_sb_14_with_news: pandas Series of numeric values.
        title_prefixes_without_news: one panel title per embedding.
        title_prefixes_with_news: one panel title per embedding.
    """
    panel_groups = [
        (reses_without_news, region_without_news, ged_sb_14_without_news, title_prefixes_without_news),
        (reses_with_news, region_with_news, ged_sb_14_with_news, title_prefixes_with_news),
    ]

    # Grid: 4 columns and at least 2 rows (the original layout); more rows are
    # added only when there are more than 8 panels.
    n_cols = 4
    n_panels = sum(min(len(reses), len(titles)) for reses, _, _, titles in panel_groups)
    n_rows = max(2, -(-n_panels // n_cols))
    cmap = plt.cm.Reds

    for reg in region_without_news.unique():
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows))
        axes = axes.flatten()

        idx = 0

        for reses, region, ged_sb_14, title_prefixes in panel_groups:
            mask = region == reg
            # A region may be missing from one of the two datasets
            has_rows = bool(mask.any())

            if has_rows:
                values = ged_sb_14[mask]
                lowest, highest = values.min(), values.max()
                # Avoid 0/0 -> NaN marker sizes when the region has no deaths
                size_scale = highest if highest > 0 else 1
                norm = colors.Normalize(vmin=lowest, vmax=highest)
                # A single legend entry when all values are equal
                sample_values = np.linspace(lowest, highest, num=4 if highest > lowest else 1)
                sample_sizes = sample_values / size_scale * 50 + 5

            for res, title_prefix in zip(reses, title_prefixes):
                ax = axes[idx]
                idx += 1

                ax.set_xlim(res[:, 0].min(), res[:, 0].max())
                ax.set_ylim(res[:, 1].min(), res[:, 1].max())
                ax.set_title(f"{title_prefix}")

                if not has_rows:
                    ax.text(0.5, 0.5, "no data", ha="center", va="center",
                            transform=ax.transAxes)
                    continue

                ax.scatter(
                    res[:, 0][mask],
                    res[:, 1][mask],
                    c=values,
                    alpha=(values > 0) * 0.7 + 0.10,
                    s=values / size_scale * 50 + 5,
                    cmap=cmap,
                    norm=norm,
                    edgecolors='black',  # Adds a thin black border
                    linewidths=0.5  # Sets the border thickness
                )

                # Empty scatters serve as legend handles with matching size and colour
                legend_elements = []
                for val, size in zip(sample_values, sample_sizes):
                    handle = ax.scatter([], [], s=size, c=[cmap(norm(val))],
                                        edgecolors='black', linewidths=0.5)
                    legend_elements.append((handle, int(val)))

                ax.legend([h for h, _ in legend_elements], [l for _, l in legend_elements],
                          title='Size & Color\n(battle-related\nged_sb deaths)',
                          loc="upper right")

        # Remove every grid cell that received no panel
        for unused_ax in axes[idx:]:
            fig.delaxes(unused_ax)

        fig.suptitle(f"{reg} T-SNE based on:", fontsize=16)
        plt.tight_layout()
        plt.savefig(f"../figures/tsne-findings/{reg} clusters.png", dpi=300)
        plt.show()
        # Release the figure so repeated regions do not accumulate open figures
        plt.close(fig)

In [None]:
# Embeddings fitted on the frame without news rows filtering, with their panel titles
reses_without_news = [res_cm_features, res_cm_features_without_ged_acled]

title_prefixes_without_news = [
    "cm_features since 1990 excluding\n"
    "month_id, country_id and dependent variable",
    "cm_features since 1990 excluding\n"
    "month_id, country_id, dependent variable,\n"
    "ged and acled columns",
]

# Embeddings fitted on the rows that have news, with their panel titles
reses_with_news = [
    res_news,
    res_cm_features2010,
    res_data_with_news,
    res_cm_features2010_without_ged_acled,
    res_data_with_news_without_ged_acled,
]

title_prefixes_with_news = [
    "news since 2010",
    "cm_features since 2010 excluding\n"
    "month_id, country_id and dependent variable",
    "cm_features & news since 2010 excluding\n"
    "month_id, country_id and dependent variable",
    "cm_features since 2010 excluding\n"
    "month_id, country_id, dependent variable,\n"
    "ged and acled columns",
    "cm_features & news since 2010 excluding\n"
    "month_id, country_id, dependent variable,\n"
    "ged and acled columns",
]

# One multi-panel figure per region
noninteractive_plots_2x3(
    reses_without_news=reses_without_news,
    reses_with_news=reses_with_news,
    region_without_news=region_without_news,
    region_with_news=region_with_news,
    ged_sb_14_without_news=ged_sb_14_without_news,
    ged_sb_14_with_news=ged_sb_14_with_news,
    title_prefixes_without_news=title_prefixes_without_news,
    title_prefixes_with_news=title_prefixes_with_news,
)

# Similarities

In [None]:
# TODO: find similarities for cm_features and news since 2010 + output dates that are similar

# Data: load the feature table and drop every news-topic column
data = pd.read_csv("../data/cm_features_v3.6.csv")
data = data.loc[:, ~data.columns.str.startswith("stock_topic")]

# Target: ged_sb moved PREDICTION_WINDOW rows ahead within each ccode group
data["ged_sb_14"] = data.groupby("ccode")["ged_sb"].shift(periods=-PREDICTION_WINDOW)

# Target divided by wdi_sp_pop_totl and scaled by one million
data["armed_conflict_14"] = data["ged_sb_14"].div(data["wdi_sp_pop_totl"]).mul(1_000_000)

# Rows whose shifted target is missing are removed
data = data[data["ged_sb_14"].notna()]

# Targets and identifier columns are taken out so `data` holds features only
ac, gw_statename, gw, ccode, region = (
    data.pop(column)
    for column in ("armed_conflict_14", "gw_statename", "gleditsch_ward", "ccode", "region")
)
region23, ged_sb_14, country_id, date, month_id = (
    data.pop(column)
    for column in ("region23", "ged_sb_14", "country_id", "date", "month_id")
)

In [None]:
# T-SNE
# Perplexity: searched over perplexity_list when enabled, otherwise fixed at 20
min_perplexity = (
    search_best_perplexity(data, perplexity_list=perplexity_list, n=2)
    if SEARCH_BEST_PERPLEXITY
    else 20
)

tsne = TSNE(n_jobs=-1, perplexity=min_perplexity)

# Embedding that the clustering cell below groups by proximity
res = tsne.fit_transform(data)

In [None]:
import numpy as np
from sklearn.cluster import DBSCAN

def find_clusters(points, threshold):
    """Group points with DBSCAN and return the members of every cluster.

    DBSCAN runs with ``eps=threshold`` and ``min_samples=1``. The number of
    clusters found is printed.

    Args:
        points: array of coordinates, one row per point.
        threshold: DBSCAN ``eps`` (neighbourhood radius).

    Returns:
        ``(clusters, indexes)``: two lists with one entry per cluster label
        ``0, 1, ...``. ``indexes[k]`` holds the ascending row positions of the
        members of cluster ``k`` and ``clusters[k]`` is ``points[indexes[k]]``.
    """
    # Initialize DBSCAN with the given threshold (eps) and min_samples=1
    db = DBSCAN(eps=threshold, min_samples=1, algorithm='kd_tree', n_jobs=-1)
    # Fit the model to your data
    db.fit(points)
    # Extract labels assigned by DBSCAN
    labels = db.labels_
    # Number of clusters in labels, ignoring noise if present
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    print(f'Estimated number of clusters: {n_clusters}')

    if n_clusters == 0:
        return [], []

    # Group row positions by label with one stable sort instead of scanning all
    # labels once per cluster (which is quadratic when almost every point is
    # its own cluster).
    clustered = np.flatnonzero(labels >= 0)  # skip noise points (label -1)
    # Stable sort keeps the row positions ascending inside each cluster
    order = clustered[np.argsort(labels[clustered], kind="stable")]
    sizes = np.bincount(labels[clustered], minlength=n_clusters)
    indexes = np.split(order, np.cumsum(sizes)[:-1])

    clusters = [points[member_rows] for member_rows in indexes]
    return clusters, indexes
    
# Neighbourhood radius handed to find_clusters as the DBSCAN eps
threshold = 1e-6

clusters, indexes = find_clusters(points=res, threshold=threshold)

In [None]:
# Collect the clusters whose members belong to more than one state name
multiple_countries = []

for i in indexes:
    if len(gw_statename.iloc[i].unique()) <= 1:
        continue
    print("-" * 16)
    multiple_countries.append(i)
    print(gw_statename.iloc[i].unique())

In [None]:
# Which multi-country cluster to export
index = 0
# Target, state name, date and features of that cluster, side by side in one CSV
pd.concat(
    [
        source.iloc[multiple_countries[index]]
        for source in (ged_sb_14, gw_statename, date, data)
    ],
    axis=1,
).to_csv("t.csv")

In [None]:
# Clusters with more than two members
indexes2 = [array for array in indexes if len(array) > 2]
len(indexes2)

In [None]:
# Which of the larger clusters to export
index = 1000
# Target, state name, date and features of that cluster, side by side in one CSV
pd.concat(
    [
        source.iloc[indexes2[index]]
        for source in (ged_sb_14, gw_statename, date, data)
    ],
    axis=1,
).to_csv("a.csv")

In [None]:
# cosine_similarity = (res @ res.T) / np.linalg.norm(res, axis=1) / np.linalg.norm(point).T