<a href="https://colab.research.google.com/github/goodwillhunting9/AI-Driven-Food-Security-Platform/blob/main/AI_Assignment2_21386825.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Upload the file manually in Google Colab or use the path if it's already in Colab

import pandas as pd

# Step 2: Point at the workbook inside the Colab runtime (edit this if the upload lands somewhere else)
file_path = '/content/Twitter_Dataset.xlsx'

# Step 3: Read the workbook's third tab; sheet positions are zero-based, hence sheet_name=2
df = pd.read_excel(io=file_path, sheet_name=2)

# Step 4: Preview the top rows to confirm the sheet loaded as expected
df.head()


Load the dataset.

In [None]:
# Read the third worksheet (zero-based position 2) of the workbook into a fresh DataFrame
file_path = '/content/Twitter_Dataset.xlsx'
df = pd.read_excel(io=file_path, sheet_name=2)

# Print the column labels so the names used for column lookups can be verified
print(df.columns)


Step 1: Update the column reference in the code

In [None]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Fetch the NLTK stopword corpus, then gather its English entries into a set for fast membership tests
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

# Function to clean the text
def clean_text(text, stopword_set=None):
    """Normalise one tweet into lowercase, space-separated content words.

    Steps, in order: strip URLs, strip @mentions and the '#' symbol (the
    hashtag word itself is kept), strip digits, strip punctuation, lowercase,
    and drop stopwords.

    Args:
        text: The raw tweet. None and float NaN (how pandas represents an
            empty spreadsheet cell) are treated as an empty tweet; any other
            non-string value is converted with str() first.
        stopword_set: Optional collection of lowercase words to drop.
            Defaults to the module-level `stop_words`.

    Returns:
        The cleaned tweet as a single string ('' when nothing survives).
    """
    # Missing cells would otherwise make re.sub raise TypeError mid-apply.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    active_stopwords = stop_words if stopword_set is None else stopword_set

    # Remove URLs ('http\S+' already covers https links)
    text = re.sub(r"http\S+|www\S+", '', text)
    # Remove whole @mentions, but only the '#' symbol of a hashtag
    text = re.sub(r'\@\w+|\#', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove special characters and punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Lowercase so tokens compare equal to the lowercase stopword entries
    text = text.lower()
    # Remove stopwords
    return ' '.join(word for word in text.split() if word not in active_stopwords)

# Store a cleaned copy of every tweet from the 'Text' column next to the raw text
df['cleaned_text'] = df['Text'].apply(clean_text)

# Word Frequency: flatten the cleaned tweets into one token list and tally it
word_list = [token for tweet in df['cleaned_text'] for token in tweet.split()]
word_freq = Counter(word_list)

# Show the ten most frequent tokens together with their counts
print(word_freq.most_common(10))

# Word Cloud: feed the tokens back in as one space-separated string and draw the result
wordcloud = WordCloud(background_color='white', width=800, height=400).generate(' '.join(word_list))
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()


N-Gram Analysis

In [None]:
from nltk import ngrams

# Create bigrams (2-word combinations) one tweet at a time. Joining every tweet
# into a single token stream first would also pair the last word of one tweet
# with the first word of the next, counting word pairs nobody actually wrote.
bigrams = [
    bigram
    for tweet in df['cleaned_text']
    for bigram in ngrams(tweet.split(), 2)
]
bigram_freq = Counter(bigrams)

# Display the most common bigrams
print(bigram_freq.most_common(10))


Bigrams

In [None]:
from nltk import ngrams

# Create bigrams one tweet at a time. Joining every tweet into a single token
# stream first would also pair the last word of one tweet with the first word
# of the next, counting word pairs nobody actually wrote.
bigrams = [
    bigram
    for tweet in df['cleaned_text']
    for bigram in ngrams(tweet.split(), 2)
]
bigram_freq = Counter(bigrams)

# Display the most common bigrams
print(bigram_freq.most_common(10))


Visualising the top bigrams with a bar plot

In [None]:
import matplotlib.pyplot as plt

# Take the ten most frequent bigrams and separate the word pairs from their counts
bigram_data = bigram_freq.most_common(10)
labels, values = zip(*bigram_data)

# Each bigram is a tuple of words; join it into one string so it can label a bar
labels = list(map(' '.join, labels))

# Horizontal bar chart: one bar per bigram, bar length = frequency
plt.figure(figsize=(10, 5))
plt.barh(y=labels, width=values)
plt.xlabel('Frequency')
plt.ylabel('Bigrams')
plt.title('Top 10 Bigrams')
plt.show()


Tweet count statistics over time

In [None]:
# Parse the 'Timestamp' column into datetimes so the .dt accessor can be used on it
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Count the non-null 'Text' entries that fall on each calendar date
tweet_counts = df['Text'].groupby(df['Timestamp'].dt.date).count()

# Draw the daily counts as a line chart on a 10x5 figure
plt.figure(figsize=(10, 5))
tweet_counts.plot.line()
plt.xlabel('Date')
plt.ylabel('Number of Tweets')
plt.title('Tweet Counts Over Time')
plt.show()



Sentiment Analysis:


In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Fetch the VADER lexicon and build the analyzer that scores each tweet
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

# Apply sentiment analysis to the RAW tweet text rather than 'cleaned_text'.
# The cleaning step strips stopwords (which include negations such as "not"
# and "no"), punctuation and capitalisation - cues VADER relies on - so
# scoring the cleaned text can flip or flatten a tweet's polarity.
# Missing tweets are scored as empty strings instead of raising.
df['sentiment'] = (
    df['Text']
    .fillna('')
    .astype(str)
    .apply(lambda tweet: sid.polarity_scores(tweet)['compound'])
)

# Group sentiment by day: mean compound score per calendar date.
# Requires 'Timestamp' to already hold datetimes (needed for the .dt accessor).
sentiment_over_time = df.groupby(df['Timestamp'].dt.date)['sentiment'].mean()

# Plot sentiment over time
plt.figure(figsize=(10, 5))
sentiment_over_time.plot(kind='line')
plt.title('Sentiment Over Time')
plt.xlabel('Date')
plt.ylabel('Sentiment Score')
plt.show()


Topic Modelling

In [None]:
import gensim
from gensim import corpora

# Prepare data for LDA: tokenise each cleaned tweet, map tokens to integer ids,
# and convert every tweet to its bag-of-words (token id, count) representation
texts = [text.split() for text in df['cleaned_text']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Train LDA model. LDA training is randomised; fixing random_state makes the
# discovered topics reproducible from one run of the notebook to the next.
lda_model = gensim.models.LdaModel(
    corpus,
    num_topics=5,
    id2word=dictionary,
    passes=10,
    random_state=42,
)

# Display every topic (-1 = all) with its highest-weighted words
for idx, topic in lda_model.print_topics(-1):
    print(f'Topic: {idx} \nWords: {topic}')


5. Creative Insight Extraction

In [None]:
!pip install transformers

from transformers import pipeline

# Build a Hugging Face sentiment-analysis pipeline; no model is named here, so a different one can be chosen if desired
sentiment_model = pipeline(task='sentiment-analysis')

# Run the pipeline on the first cleaned tweet and print what it returns
result = sentiment_model(df['cleaned_text'].iloc[0])
print(result)


In [None]:
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Fetch the NLTK stopword corpus, then gather its English entries into a set for fast membership tests
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

# Function to clean the text
def clean_text(text, stopword_set=None):
    """Normalise one tweet into lowercase, space-separated content words.

    Steps, in order: strip URLs, strip @mentions and the '#' symbol (the
    hashtag word itself is kept), strip digits, strip punctuation, lowercase,
    and drop stopwords.

    Args:
        text: The raw tweet. None and float NaN (how pandas represents an
            empty spreadsheet cell) are treated as an empty tweet; any other
            non-string value is converted with str() first.
        stopword_set: Optional collection of lowercase words to drop.
            Defaults to the module-level `stop_words`.

    Returns:
        The cleaned tweet as a single string ('' when nothing survives).
    """
    # Missing cells would otherwise make re.sub raise TypeError mid-apply.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    active_stopwords = stop_words if stopword_set is None else stopword_set

    # Remove URLs ('http\S+' already covers https links)
    text = re.sub(r"http\S+|www\S+", '', text)
    # Remove whole @mentions, but only the '#' symbol of a hashtag
    text = re.sub(r'\@\w+|\#', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove special characters and punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Lowercase so tokens compare equal to the lowercase stopword entries
    text = text.lower()
    # Remove stopwords
    return ' '.join(word for word in text.split() if word not in active_stopwords)

# Store a cleaned copy of every tweet from the 'Text' column next to the raw text
df['cleaned_text'] = df['Text'].apply(clean_text)

# Word Frequency: flatten the cleaned tweets into one token list and tally it
word_list = [token for tweet in df['cleaned_text'] for token in tweet.split()]
word_freq = Counter(word_list)

# Show the ten most frequent tokens together with their counts
print(word_freq.most_common(10))

# Word Cloud: feed the tokens back in as one space-separated string and draw the result
wordcloud = WordCloud(background_color='white', width=800, height=400).generate(' '.join(word_list))
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
