# In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# Ask Colab for the CSV files and keep whatever files.upload() returns
from google.colab import files

uploaded = files.upload()

# Echo the upload result so it can be inspected
print(uploaded)

# Read both CSVs by their fixed file names. header=None keeps the first row
# as data, so the columns are addressed by integer position further down.
training_data, validation_data = (
    pd.read_csv(csv_name, header=None)
    for csv_name in ('twitter_training.csv', 'twitter_validation.csv')
)

# Preview the first rows of the training set, then of the validation set
print(training_data.head(), validation_data.head(), sep='\n')

# Count how many training rows carry each sentiment label (column 2)
sentiment_counts = training_data[2].value_counts()

# Bar chart of those counts: one bar per label, bar height = row count
plt.figure(figsize=(10, 6))
sentiment_ax = sns.barplot(
    x=sentiment_counts.index,
    y=sentiment_counts.values,
    palette="viridis",
)
# Label the axes object that barplot drew on
sentiment_ax.set(
    title='Distribution of Sentiments in Training Data',
    xlabel='Sentiment',
    ylabel='Number of Tweets',
)
plt.show()

# Distribution of sentiments per entity in the training data:
# rows are entities (column 1), columns are sentiment labels (column 2),
# cells are the number of training rows for that (entity, sentiment) pair.
entity_sentiment_counts = training_data.groupby([1, 2]).size().unstack()

# Plot the distribution of sentiments per entity as stacked bars.
# DataFrame.plot opens its own figure when no `ax` is passed, so a separate
# plt.figure() call beforehand would only leave an empty extra figure in the
# notebook output. Size the figure through `figsize` and label the axes that
# plot() returns instead.
entity_ax = entity_sentiment_counts.plot(
    kind='bar',
    stacked=True,
    colormap='viridis',
    figsize=(14, 10),
)
entity_ax.set_title('Distribution of Sentiments per Entity in Training Data')
entity_ax.set_xlabel('Entity')
entity_ax.set_ylabel('Number of Tweets')
entity_ax.legend(title='Sentiment')
plt.show()

# Function to generate and display a word cloud
def generate_word_cloud(data, sentiment):
    """Display a word cloud built from the tweets labelled ``sentiment``.

    Args:
        data: DataFrame whose column ``2`` holds the sentiment label and
            whose column ``3`` holds the tweet text.
        sentiment: Label value used to select rows from ``data``.

    Returns:
        None. The word cloud is shown with matplotlib. If the selection
        contains no text at all, a notice is printed and nothing is drawn.
    """
    tweets = data.loc[data[2] == sentiment, 3]

    # Drop missing tweets *before* casting to str: astype(str) turns NaN into
    # the literal string 'nan', which would then be counted as a word.
    text = " ".join(tweets.dropna().astype(str))

    if not text.strip():
        # WordCloud.generate raises ValueError when it is given no words, so
        # skip this label instead of aborting the caller's loop.
        print(f'No tweet text available for {sentiment} tweets; skipping word cloud.')
        return

    wordcloud = WordCloud(
        width=800,
        height=400,
        random_state=21,  # fixed seed so the layout is reproducible
        max_font_size=110,
        background_color='white',
    ).generate(text)

    plt.figure(figsize=(10, 7))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f'Word Cloud for {sentiment} Tweets')
    plt.show()

# Generate word clouds for each sentiment.
# sentiment_counts.index holds the distinct labels found in column 2 of the
# training data (in the order value_counts() returned them); each iteration
# calls generate_word_cloud once for that label.
for sentiment in sentiment_counts.index:
    generate_word_cloud(training_data, sentiment)

