In [None]:
import plotly.express as px

In [None]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import pandas as pd

# Load the dataset
# `use_auth_token=True` is passed through to the Hub client; presumably the
# repository is private or gated and needs the locally stored token — TODO confirm.
dataset = load_dataset("fathyshalab/mdsci",use_auth_token=True)

# Bare last expression: shows the loaded object as the cell output.
dataset


In [None]:


# Annotate every example of both splits with its whitespace-token count.
def _with_word_count(example):
    """Return the example's text/label/domain plus ``num_words`` (whitespace-separated tokens in the text)."""
    return {
        "text": example["text"],
        "label": example["label"],
        "domain": example["domain"],
        "num_words": len(example["text"].split()),
    }


for _split in ("train", "test"):
    dataset[_split] = dataset[_split].map(_with_word_count)


In [None]:
# Materialise both splits as pandas DataFrames for the analysis cells.
train_df, test_df = (pd.DataFrame(dataset[split_name]) for split_name in ("train", "test"))
train_df

### Stats about the dataset

In [None]:
# Import required libraries
import pandas as pd

# Characters that have a special meaning in LaTeX and their escaped forms.
_LATEX_SPECIALS = {
    "\\": r"\textbackslash{}",
    "&": r"\&",
    "%": r"\%",
    "$": r"\$",
    "#": r"\#",
    "_": r"\_",
    "{": r"\{",
    "}": r"\}",
    "~": r"\textasciitilde{}",
    "^": r"\textasciicircum{}",
}


def _latex_escape(value):
    """Return ``value`` as text that is safe inside a LaTeX tabular cell.

    Runs of whitespace (including line breaks, which would otherwise start a
    new paragraph inside the cell) are collapsed to a single space, then every
    LaTeX special character is escaped. Escaping is done character by
    character so already-inserted backslashes are never escaped twice.
    """
    text = " ".join(str(value).split())
    return "".join(_LATEX_SPECIALS.get(char, char) for char in text)


# Collect one example query per (domain, intent) pair.
# Rows are gathered in a plain list and converted once: `DataFrame.append`
# was removed in pandas 2.0 and re-copied the whole frame on every call.
records = []
# Iterate over unique domains in the train_df dataframe
for domain in train_df['domain'].unique():
    domain_rows = train_df[train_df['domain'] == domain]
    # Iterate over unique intents within each domain
    for intent in domain_rows['label_name'].unique():
        # Get the first sample for the current domain and intent
        sample = domain_rows.loc[domain_rows['label_name'] == intent, 'text'].iloc[0]
        records.append({'Domain': domain, 'Intent': intent, 'Sample': sample})

# The raw (unescaped) samples stay in the frame; escaping happens on export only.
latex_table = pd.DataFrame(records, columns=['Domain', 'Intent', 'Sample'])

df = latex_table
# Select the first row per domain
smaller_df = df.groupby('Domain').first().reset_index()
# Explicit UTF-8: the samples are German text and the platform default
# encoding is not guaranteed to represent umlauts.
with open("my_table_small.tex", "w", encoding="utf-8") as f:
    f.write("\\begin{tabular}{" + " | ".join(["c"] * len(smaller_df.columns)) + "}\n")
    for row in smaller_df.itertuples(index=False):
        f.write(" & ".join(_latex_escape(cell) for cell in row) + " \\\\\n")
    f.write("\\end{tabular}")




In [None]:
# No visible cell defines a variable named `latex`, so the previous
# `print(latex)` raised NameError on a fresh kernel. Render the sample table
# built above as LaTeX and print that instead.
print(latex_table.to_latex(index=False))

In [None]:
# Show the sample text stored in the row labelled 0.
latex_table.loc[0, "Sample"]

In [None]:
# Alias, not a copy: `dataframe` and `train_df` refer to the same DataFrame object.
dataframe = train_df

In [None]:
# Number of queries per domain, most frequent domain first.
fig = px.histogram(dataframe, x='domain', template='plotly_white', title='Queries counts by Domain')
# The domain is on the x-axis; the y-axis holds the counts, so it must not
# carry the 'DOMAIN' title (it did before).
fig.update_xaxes(categoryorder='total descending', title='DOMAIN').update_yaxes(title='Number of Queries')
fig.show()

In [None]:
# Number of queries per intent, most frequent intent first.
fig = px.histogram(dataframe, x='label_name', template='plotly_white', title='Queries counts by Intent')
# The intent is on the x-axis; the y-axis holds the counts, so it must not
# carry the 'INTENT' title (it did before).
fig.update_xaxes(categoryorder='total descending', title='INTENT').update_yaxes(title='Number of Queries')
fig.show()

In [None]:
# Distribution of query lengths, measured in whitespace-separated words.
fig = px.histogram(
    dataframe,
    x='num_words',
    title='Queries counts by word count',
    template='plotly_white',
)
fig.update_xaxes(categoryorder='total descending')
fig.update_yaxes(title='Number of Queries')
fig.show()

In [None]:
# Queries per intent, with the bars coloured by domain
fig = px.histogram(dataframe, x='label_name', template='plotly_white', title='Queries counts by intent'
                   , color='domain')
# The counts live on the y-axis. The previous second `update_xaxes` call
# overwrote the x title 'Intent' with 'Number of queries' and left y untitled.
fig.update_xaxes(categoryorder='category descending', title='Intent').update_yaxes(title='Number of queries')
fig.show()

In [None]:
# Domain -> intent hierarchy, coloured by the `num_words` column.
fig = px.treemap(
    dataframe,
    path=['domain', 'label_name'],
    color='num_words',
    color_continuous_scale=px.colors.sequential.GnBu,
    title='Treemap chart by domain and the corresponding intent with the average n_words',
    width=2048,
)
fig.show()

## Qualitative comparison

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

def perform_tfidf_analysis(texts, intent):
    """
    Run a TF-IDF analysis over the texts of one intent and plot the result.

    German stopwords are removed, the remaining text is vectorised with
    TF-IDF, and a two-panel figure is drawn, saved as ``tfid-{intent}.png``
    and shown:

    - left: one point per sample, its cosine similarity to sample 0 (x)
      against its cosine similarity to sample 1 (y); samples whose similarity
      to sample 1 is >= 0.99 are left out. The panel stays empty when fewer
      than two texts are given.
    - right: the 20 terms with the largest TF-IDF weight summed over all texts.

    Args:
    - texts (list): List of strings containing the texts
    - intent (str): Intent name; only used to build the output file name

    Returns:
    - None
    """
    # Remove German stopwords from texts (a set keeps the lookup O(1) per word)
    stopwords = set(nltk_stopwords.words('german'))
    texts = [' '.join(word for word in text.split() if word.lower() not in stopwords) for text in texts]

    # Vectorize the texts using TF-IDF
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(texts)

    # Compute document similarities (square matrix, one row/column per text)
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Identify common and rare words: summed TF-IDF weight per vocabulary term
    word_scores = pd.DataFrame(tfidf_matrix.sum(axis=0), columns=tfidf.get_feature_names_out()).T
    word_scores.columns = ['tfidf_score']
    word_scores = word_scores.sort_values('tfidf_score', ascending=False)
    top_words = word_scores.head(20)

    # Visualize the results
    fig, axes = plt.subplots(1, 2, figsize=(12, 6))

    # Scatter plot of similarity scores. Columns 0 and 1 only exist when at
    # least two texts were passed; indexing them unconditionally raised
    # IndexError for single-sample intents.
    if similarity_matrix.shape[1] >= 2:
        kept_rows = similarity_matrix[similarity_matrix[:, 1] < 0.99]
        sns.scatterplot(x=kept_rows[:, 0], y=kept_rows[:, 1], ax=axes[0])
    # Labels describe what is actually plotted (both axes are similarities,
    # neither is a sample index).
    axes[0].set_xlabel('Similarity to Sample 0')
    axes[0].set_ylabel('Similarity to Sample 1')
    axes[0].set_title('Similarity Scores between Samples')

    # Bar plot of top 20 words by TF-IDF score
    sns.barplot(x=top_words.tfidf_score, y=top_words.index, ax=axes[1])
    # Rotate via tick_params: re-setting the labels with set_xticklabels()
    # freezes the tick text and triggers a fixed-locator warning.
    axes[1].tick_params(axis='x', rotation=90)
    axes[1].set_xlabel('TF-IDF Score')
    axes[1].set_ylabel('Word')
    axes[1].set_title('Top 20 Words by TF-IDF Score')

    plt.tight_layout()
    plt.savefig(f"tfid-{intent}.png")
    plt.show()
    # The function is called once per intent; release the figure so they do
    # not pile up in memory.
    plt.close(fig)


In [None]:
# Run the TF-IDF analysis once per intent, in order of first appearance.
for intent_name in train_df['label_name'].unique():
    intent_mask = train_df['label_name'] == intent_name
    perform_tfidf_analysis(train_df[intent_mask]['text'].tolist(), intent_name)

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, balanced_accuracy_score, accuracy_score,f1_score
from sentence_transformers import SentenceTransformer


In [None]:
def baseline(dmo,dataset):
    """Score the baseline: sentence embeddings fed into a logistic-regression classifier.

    Args:
        dmo: Domain slug, interpolated into the model id
            ``fathyshalab/reklambox-{dmo}-setfit``.
        dataset: Object indexable by "train" and "test", each split exposing
            "text" and "label" columns.

    Returns:
        Tuple ``(f1, accuracy, balanced_accuracy)`` measured on the test
        split; the F1 score uses weighted averaging.
    """
    encoder = SentenceTransformer(f"fathyshalab/reklambox-{dmo}-setfit", use_auth_token=True)

    # Embed both splits (train first, then test)
    train_vectors = encoder.encode(dataset["train"]["text"])
    test_vectors = encoder.encode(dataset["test"]["text"])

    features_train = np.array(list(train_vectors))
    labels_train = dataset["train"]["label"]
    labels_test = dataset["test"]["label"]
    features_test = np.array(list(test_vectors))

    # Fit the classifier on the training embeddings, then predict the test split
    classifier = LogisticRegression(random_state=42)
    classifier.fit(features_train, labels_train)
    predictions = classifier.predict(features_test)

    return (
        f1_score(labels_test, predictions, average="weighted"),
        accuracy_score(labels_test, predictions),
        balanced_accuracy_score(labels_test, predictions),
    )

In [None]:
from datasets import load_dataset


In [None]:
from tqdm import tqdm

# Domain slugs; each one selects both a dataset repo and a model repo.
dms = [
    "supermaerkte-drogerien",
    "mode-schmuck-zubehoer",
    "moebel-einrichtungshaeuser",
    "finanzen",
    "reisen-tourismus",
    "schoenheit-wellness",
    "unternehmen-verbaende",
    "medizin-gesundheit-pflege",
    "transport-logistik",
    "versicherungen-recht",
    "oeffentlichkeit-soziales",
    "oeffentlicher-verkehr-vermietung",
    "unterhaltung-kultur-freizeit",
    "wasser-strom-gas",
    "haus-reinigung",
]

# Pre-seed an (initially empty) result dict for every domain.
metricss = {}
for dm in dms:
    metricss[dm] = {}

# NOTE(review): the repo prefix here is `mdcsi_`, while the first cell loads
# `fathyshalab/mdsci` — confirm both spellings are intended.
for dm in tqdm(dms):
    dataset = load_dataset(f"fathyshalab/mdcsi_{dm}", use_auth_token=True)
    f1_value, accuracy_value, balanced_value = baseline(dm, dataset)
    metricss[dm].update(
        {"f1": f1_value, "accuracy": accuracy_value, "balanced_accuracy": balanced_value}
    )

metricss
    


In [None]:
# take metricss and plot them
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
# Long form: one row per (domain, metric) with the score in `value`.
df = (
    pd.DataFrame(metricss).T
    .reset_index()
    .rename(columns={"index": "domain"})
    .melt(id_vars=["domain"], var_name="metric", value_name="value")
)

# Legend labels per metric key. An explicit mapping replaces the former chain
# of substring replacements, which matched "Accuracy" inside
# "Balanced Accuracy" and then appended "-Score" a second time
# ("Balanced\nAccuracy- Score- Score").
metric_labels = {
    "f1": "F1- Score",
    "accuracy": "Accuracy- Score",
    "balanced_accuracy": "Balanced\nAccuracy- Score",
}
# Exact-value replacement; metric names without an entry are left unchanged.
df["metric"] = df["metric"].replace(metric_labels)

# plot
sns.set_theme(style="whitegrid")
plt.figure(figsize=(20, 10))
ax = sns.barplot(x="value", y="domain", hue="metric", data=df)
# Place the legend BEFORE saving so baseline.png matches the displayed figure.
plt.legend(loc='lower right')
plt.savefig("baseline.png")
plt.show()


In [None]:
import json
import altair as alt
from matplotlib import pyplot as plt
import pandas as pd
# Read the stored results. JSON is UTF-8 by specification, so the encoding is
# stated explicitly instead of relying on the platform default.
with open("setfit-soupres5-new.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [None]:
# Attach the baseline scores under the "orig" key of every domain that has them.
for domain_key in data:
    if domain_key in metricss:
        data[domain_key]["orig"] = metricss[domain_key]

data['supermaerkte-drogerien']

In [None]:
# Turn off Altair's maximum-row check for chart data.
alt.data_transformers.disable_max_rows()

In [None]:
# Bare expression: shows the whole results dictionary as the cell output.
data

In [None]:
# Compare, per domain and metric, the best-scoring SetFit entry with the baseline.
table_rows = []
metrics = ['f1', 'accuracy', 'balanced_accuracy']

# Iterate over the data
for domain_name, domain_data in data.items():
    baseline_scores = domain_data.get('orig')
    # Only entries that are not the baseline and whose name contains the
    # domain name compete. Filtering up front (instead of scoring the others
    # as 0 inside max()) guarantees 'orig' or an unrelated entry can never be
    # picked when no candidate scores above 0.
    candidates = [name for name in domain_data if name != 'orig' and domain_name in name]
    if baseline_scores is None or not candidates:
        print(f"Skipping {domain_name}: no baseline scores or no matching entries")
        continue

    for metric in metrics:
        # Entry with the highest value for this metric (first one wins on ties)
        max_metric = max(candidates, key=lambda name: domain_data[name][metric])
        # Get the value of the metric
        value = domain_data[max_metric][metric]
        baseline_value = baseline_scores[metric]

        # Determine the improvement, decrease, or no change
        if value > baseline_value:
            improvement = 'Improvement'
        elif value < baseline_value:
            improvement = 'Decrease'
        else:
            improvement = 'No Change'

        # A baseline of 0 has no meaningful relative change; report NaN
        # instead of dividing by zero.
        if baseline_value:
            percent_diff = (value - baseline_value) / baseline_value * 100
        else:
            percent_diff = float('nan')
        # Add the row to the table
        table_rows.append([domain_name, metric, value, baseline_value, max_metric, improvement, percent_diff])

# Create a pandas DataFrame with the table rows
# NOTE(review): ' SETFIT Value' starts with a space; kept as-is because it is
# the exported CSV header — confirm whether that is intended.
df = pd.DataFrame(table_rows, columns=['Domain', 'Metric', ' SETFIT Value', "Baseline Value", "Combo Name", 'Change', '% Difference'])
# export the table for powerpoint
df.to_csv("setfit-soupres5new.csv", index=False)

In [None]:
# Export the comparison table to its own file. It used to be written to
# "setfit-soupres5new.tex", which a later cell overwrites with a different
# table, so this export was lost after a full run.
df.to_latex("setfit-soupres5new-comparison.tex", index=False)

In [None]:
# Per domain, count how many metric rows fall into each 'Change' category.
df2 = (
    df[["Domain", "Change"]]
    .groupby(["Domain", "Change"])
    .size()
    .reset_index(name='counts')
)
df2.to_latex("setfit-soupres5new-small.tex",index=None)

In [None]:
# Flatten the nested results into one row per (category, subcategory, metric).
tidy_rows = []
for category, subcategories in data.items():
    for subcategory, scores in subcategories.items():
        for metric in scores.keys():
            tidy_rows.append((category, subcategory, metric, scores[metric]))

df = pd.DataFrame(tidy_rows, columns=['Category', 'Subcategory', 'Metric', 'Value'])
df

In [None]:
# Write the current `df` to setfit-soupres5new.tex as a LaTeX table.
df.to_latex("setfit-soupres5new.tex",index=None)

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Flatten the nested results into one row per (category, subcategory, metric).
tidy_rows = []
for category, subcategories in data.items():
    for subcategory, scores in subcategories.items():
        for metric in scores.keys():
            tidy_rows.append((category, subcategory, metric, scores[metric]))

df = pd.DataFrame(tidy_rows, columns=['Category', 'Subcategory', 'Metric', 'Value'])

# Create a dropdown selection for the category
# The dropdown offers every key of `data` and starts on the first one.
# NOTE(review): `category_selection` is not referenced by any later line
# visible in this notebook; the plots below are drawn with seaborn.
# NOTE(review): `selection_single` / `init` look like the pre-5.0 Altair API —
# confirm against the installed Altair version.
category_selection = alt.selection_single(
    name='CategorySelector',
    fields=['Category'],
    bind=alt.binding_select(options=list(data.keys())),
    init={'Category': list(data.keys())[0]}
)

# One bar chart per (domain, metric) showing the ten best-scoring entries.
# `bar_plots` collects the Figure objects. The previous version appended the
# `plt` module itself on every iteration, so the list held the same module
# N times rather than the individual plots.
bar_plots = []

# Iterate over each domain
for domain, domain_data in data.items():
    # The metric names are taken from the baseline entry; a domain without
    # one cannot be plotted (and used to raise KeyError).
    if 'orig' not in domain_data:
        continue

    # Create a subset of the data for the current domain
    domain_df = df[df['Category'] == domain]

    # Create a separate bar plot for each metric in the current domain
    for metric in domain_data['orig'].keys():
        # Filter the data for the current metric
        metric_data = domain_df[domain_df['Metric'] == metric]

        # Select the top 10 values for the current metric within the domain
        top_10_data = metric_data.nlargest(10, 'Value')

        # Draw on an explicit Axes instead of the implicit pyplot state
        bar_fig, bar_ax = plt.subplots(figsize=(6, 4))
        sns.barplot(data=top_10_data, x='Value', y='Subcategory', hue='Category',
                    dodge=False, ax=bar_ax)
        bar_ax.set_title(f"{metric} - {domain}")
        bar_ax.set_xlabel(metric)
        bar_ax.set_ylabel("Subcategory")
        bar_fig.tight_layout()

        bar_plots.append(bar_fig)

        # Show the figure now, then close it so that dozens of figures are
        # not kept open at once.
        plt.show()
        plt.close(bar_fig)
