In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the full dataset from the shared data directory (path is relative to
# the notebook's location).
df = pd.read_csv("../data/all_data.csv")

# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called directly; wrapping it in print() would emit a stray "None".
df.info()

# Last expression of the cell: rich preview of the first rows.
df.head()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

# Target variable (Sentiment) distribution

In [None]:
# Bar chart of how many rows fall into each Sentiment value.
sns.countplot(x="Sentiment", data=df)

In [None]:
# Whitespace-delimited word count of every sentence.
# The vectorised .str accessors replace a Python-level lambda, and
# fillna("") / astype(str) make missing or non-string entries count as
# 0 words (or the words of their string form) instead of raising
# AttributeError on `.split()`.
df["word_count"] = (
    df["Sentence"]
    .fillna("")
    .astype(str)
    .str.split()
    .str.len()
)

In [None]:
# Distribution of sentence length (in words) for each Sentiment value.
sns.violinplot(x="Sentiment", y="word_count", data=df)

In [None]:
# Build a word-frequency table with one row per word and one column per
# sentiment (`freq_df`), plus a long-format version of it (`freq_df_long`).

# Threshold used by the summary printed at the end of this cell.
WANTED_FREQ = 5

grouped_df = df.groupby(by="Sentiment")

word_counts_by_sentiment = {}
for sentiment, group in grouped_df:
    # Tokenise every sentence on its own. Joining all sentences with ". "
    # first would glue a period onto the last token of each sentence, and the
    # isalpha() filter below would then discard that word.
    tokens = (
        group["Sentence"]
        .astype("string")
        .str.lower()
        .str.split()
        .explode()
        .dropna()  # missing sentences and empty token lists explode to NA
    )

    # Keep purely alphabetic tokens only (drops numbers, punctuation, mixed tokens).
    tokens = tokens[tokens.str.isalpha()]

    word_counts_by_sentiment[sentiment] = tokens.value_counts()

# Building the frame from a dict of Series aligns on the UNION of all words.
# Assigning the columns one at a time into an empty frame would instead keep
# only the vocabulary of the first sentiment and silently drop every word
# that does not occur in it.
freq_df = (
    pd.DataFrame(word_counts_by_sentiment)
    .fillna(0)  # a word absent from a sentiment has frequency 0
    .astype(int)
    .rename_axis("word")
    .reset_index()
)

# Long format: one row per (word, sentiment) pair. Every column except "word"
# is melted, so the sentiment labels come from the data rather than a
# hard-coded list.
freq_df_long = freq_df.melt(
    id_vars="word",
    var_name="sentiment",
    value_name="frequency",
).sort_values("frequency")  # sort_values returns a new frame; keep the result

print(freq_df)
print(freq_df_long)

# Number of words above the threshold, one entry per sentiment column.
# Computed outside the f-string so that no quotes are nested inside it
# (nested same-type quotes are a SyntaxError before Python 3.12).
sentiment_columns = [col for col in freq_df.columns if col != "word"]
words_above_threshold = [
    int((freq_df[col] > WANTED_FREQ).sum()) for col in sentiment_columns
]
print(f"Number of words having frequency higher than '{WANTED_FREQ}': {words_above_threshold}")


In [None]:
from wordcloud import WordCloud

# Most frequent words in negative sentences: bar chart of the top 20,
# followed by a word cloud of the whole vocabulary.
negative_freq_df = freq_df[["word", "negative"]].sort_values("negative", ascending=False)

sns.barplot(x="negative", y="word", data=negative_freq_df.head(20), palette="viridis")
plt.title("Top 20 words in negative sentences")
plt.show()

# Only words that actually occur in this class are passed to the cloud;
# the `> 0` comparison is False for NaN, so missing counts are dropped too.
negative_present = negative_freq_df[negative_freq_df["negative"] > 0]
negative_cloud_dct = dict(negative_present.set_index("word")["negative"])
negative_cloud = WordCloud(
    width=1600,
    height=900,
    background_color="white",
    colormap="viridis"
).generate_from_frequencies(negative_cloud_dct)

# The figure must exist BEFORE imshow for figsize to apply to the cloud;
# calling plt.figure afterwards only opens a second, empty figure.
plt.figure(figsize=(32, 18))
plt.imshow(negative_cloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Most frequent words in positive sentences: bar chart of the top 20,
# followed by a word cloud of the whole vocabulary.
positive_freq_df = freq_df[["word", "positive"]].sort_values("positive", ascending=False)

sns.barplot(x="positive", y="word", data=positive_freq_df.head(20), palette="viridis")
plt.title("Top 20 words in positive sentences")
plt.show()

# Only words that actually occur in this class are passed to the cloud;
# the `> 0` comparison is False for NaN, so missing counts are dropped too.
positive_present = positive_freq_df[positive_freq_df["positive"] > 0]
positive_cloud_dct = dict(positive_present.set_index("word")["positive"])
positive_cloud = WordCloud(
    width=1600,
    height=900,
    background_color="white",
    colormap="viridis"
).generate_from_frequencies(positive_cloud_dct)

# The figure must exist BEFORE imshow for figsize to apply to the cloud;
# calling plt.figure afterwards only opens a second, empty figure.
plt.figure(figsize=(32, 18))
plt.imshow(positive_cloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Most frequent words in neutral sentences: bar chart of the top 20,
# followed by a word cloud of the whole vocabulary.
neutral_freq_df = freq_df[["word", "neutral"]].sort_values("neutral", ascending=False)

sns.barplot(x="neutral", y="word", data=neutral_freq_df.head(20), palette="viridis")
plt.title("Top 20 words in neutral sentences")
plt.show()

# Only words that actually occur in this class are passed to the cloud;
# the `> 0` comparison is False for NaN, so missing counts are dropped too.
neutral_present = neutral_freq_df[neutral_freq_df["neutral"] > 0]
neutral_cloud_dct = dict(neutral_present.set_index("word")["neutral"])
neutral_cloud = WordCloud(
    width=1600,
    height=900,
    background_color="white",
    colormap="viridis"
).generate_from_frequencies(neutral_cloud_dct)

# The figure must exist BEFORE imshow for figsize to apply to the cloud;
# calling plt.figure afterwards only opens a second, empty figure.
plt.figure(figsize=(32, 18))
plt.imshow(neutral_cloud, interpolation="bilinear")
plt.axis("off")
plt.show()
